npm - @midscene/core - Versions diffs - 0.26.5-beta-20250814095614.0 → 0.26.5-beta-20250814125155.0 - Mend

@midscene/core 0.26.5-beta-20250814095614.0 → 0.26.5-beta-20250814125155.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

package/dist/es/ai-model/action-executor.mjs +139 -0
package/dist/es/ai-model/action-executor.mjs.map +1 -0
package/dist/es/ai-model/common.mjs +219 -0
package/dist/es/ai-model/common.mjs.map +1 -0
package/dist/es/ai-model/index.mjs +10 -0
package/dist/es/ai-model/inspect.mjs +317 -0
package/dist/es/ai-model/inspect.mjs.map +1 -0
package/dist/es/ai-model/llm-planning.mjs +85 -0
package/dist/es/ai-model/llm-planning.mjs.map +1 -0
package/dist/es/ai-model/prompt/assertion.mjs +55 -0
package/dist/es/ai-model/prompt/assertion.mjs.map +1 -0
package/dist/es/ai-model/prompt/common.mjs +7 -0
package/dist/es/ai-model/prompt/common.mjs.map +1 -0
package/dist/es/ai-model/prompt/describe.mjs +44 -0
package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
package/dist/es/ai-model/prompt/extraction.mjs +137 -0
package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
package/dist/es/ai-model/prompt/llm-locator.mjs +275 -0
package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
package/dist/es/ai-model/prompt/llm-planning.mjs +359 -0
package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
package/dist/es/ai-model/prompt/llm-section-locator.mjs +47 -0
package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
package/dist/es/ai-model/prompt/ui-tars-locator.mjs +34 -0
package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -0
package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
package/dist/es/ai-model/prompt/util.mjs +123 -0
package/dist/es/ai-model/prompt/util.mjs.map +1 -0
package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
package/dist/es/ai-model/service-caller/index.mjs +413 -0
package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
package/dist/es/ai-model/ui-tars-planning.mjs +235 -0
package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
package/dist/es/image/index.mjs +2 -0
package/dist/es/index.mjs +7 -2360
package/dist/es/index.mjs.map +1 -1
package/dist/es/insight/index.mjs +261 -0
package/dist/es/insight/index.mjs.map +1 -0
package/dist/es/insight/utils.mjs +19 -0
package/dist/es/insight/utils.mjs.map +1 -0
package/dist/es/types.mjs +11 -0
package/dist/es/types.mjs.map +1 -0
package/dist/es/utils.mjs +2 -2
package/dist/es/yaml.mjs +0 -0
package/dist/lib/ai-model/action-executor.js +173 -0
package/dist/lib/ai-model/action-executor.js.map +1 -0
package/dist/lib/ai-model/common.js +289 -0
package/dist/lib/ai-model/common.js.map +1 -0
package/dist/lib/ai-model/index.js +103 -0
package/dist/lib/ai-model/index.js.map +1 -0
package/dist/lib/ai-model/inspect.js +360 -0
package/dist/lib/ai-model/inspect.js.map +1 -0
package/dist/lib/ai-model/llm-planning.js +119 -0
package/dist/lib/ai-model/llm-planning.js.map +1 -0
package/dist/lib/ai-model/prompt/assertion.js +92 -0
package/dist/lib/ai-model/prompt/assertion.js.map +1 -0
package/dist/lib/ai-model/prompt/common.js +41 -0
package/dist/lib/ai-model/prompt/common.js.map +1 -0
package/dist/lib/ai-model/prompt/describe.js +78 -0
package/dist/lib/ai-model/prompt/describe.js.map +1 -0
package/dist/lib/ai-model/prompt/extraction.js +177 -0
package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
package/dist/lib/ai-model/prompt/llm-locator.js +315 -0
package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
package/dist/lib/ai-model/prompt/llm-planning.js +415 -0
package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
package/dist/lib/ai-model/prompt/llm-section-locator.js +84 -0
package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
package/dist/lib/ai-model/prompt/ui-tars-locator.js +68 -0
package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -0
package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
package/dist/lib/ai-model/prompt/util.js +175 -0
package/dist/lib/ai-model/prompt/util.js.map +1 -0
package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
package/dist/lib/ai-model/service-caller/index.js +496 -0
package/dist/lib/ai-model/service-caller/index.js.map +1 -0
package/dist/lib/ai-model/ui-tars-planning.js +272 -0
package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
package/dist/lib/image/index.js +56 -0
package/dist/lib/image/index.js.map +1 -0
package/dist/lib/index.js +21 -2393
package/dist/lib/index.js.map +1 -1
package/dist/lib/insight/index.js +295 -0
package/dist/lib/insight/index.js.map +1 -0
package/dist/lib/insight/utils.js +53 -0
package/dist/lib/insight/utils.js.map +1 -0
package/dist/lib/types.js +82 -0
package/dist/lib/types.js.map +1 -0
package/dist/lib/utils.js +2 -2
package/dist/lib/yaml.js +20 -0
package/dist/lib/yaml.js.map +1 -0
package/dist/types/ai-model/action-executor.d.ts +19 -0
package/dist/types/ai-model/common.d.ts +34 -0
package/dist/types/ai-model/index.d.ts +11 -0
package/dist/types/ai-model/inspect.d.ts +49 -0
package/dist/types/ai-model/llm-planning.d.ts +10 -0
package/dist/types/ai-model/prompt/assertion.d.ts +5 -0
package/dist/types/ai-model/prompt/common.d.ts +2 -0
package/dist/types/ai-model/prompt/describe.d.ts +1 -0
package/dist/types/ai-model/prompt/extraction.d.ts +4 -0
package/dist/types/ai-model/prompt/llm-locator.d.ts +9 -0
package/dist/types/ai-model/prompt/llm-planning.d.ts +15 -0
package/dist/types/ai-model/prompt/llm-section-locator.d.ts +6 -0
package/dist/types/ai-model/prompt/playwright-generator.d.ts +25 -0
package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +1 -0
package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
package/dist/types/ai-model/prompt/util.d.ts +45 -0
package/dist/types/ai-model/prompt/yaml-generator.d.ts +99 -0
package/dist/types/ai-model/service-caller/index.d.ts +26 -0
package/dist/types/ai-model/ui-tars-planning.d.ts +76 -0
package/dist/types/image/index.d.ts +1 -0
package/dist/types/index.d.ts +9 -1289
package/dist/types/insight/index.d.ts +26 -0
package/dist/types/insight/utils.d.ts +2 -0
package/dist/types/tree.d.ts +1 -11
package/dist/types/types.d.ts +399 -0
package/dist/types/utils.d.ts +27 -47
package/dist/types/yaml.d.ts +172 -0
package/package.json +6 -6
package/dist/es/ai-model.mjs +0 -2502
package/dist/es/ai-model.mjs.map +0 -1
package/dist/lib/ai-model.js +0 -2622
package/dist/lib/ai-model.js.map +0 -1
package/dist/types/ai-model.d.ts +0 -596

package/dist/lib/ai-model/prompt/llm-locator.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"ai-model/prompt/llm-locator.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { PromptTemplate } from '@langchain/core/prompts';\nimport type { vlLocateMode } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(\n vlMode: ReturnType<typeof vlLocateMode>,\n) {\n if (vlMode) {\n const bboxComment = bboxDescription(vlMode);\n return `\n## Role:\nYou are an expert in software testing.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Give the coordinates of the element that matches the user's description best in the screenshot.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[],\n \"isOrderSensitive\": boolean // Whether the targetElementDescription is 
order-sensitive (true/false)\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description best in the screenshot\n* \\`isOrderSensitive\\` is a boolean indicating whether the user's description is order-sensitive (true/false)\n* \\`errors\\` is an optional array of error messages (if any)\n\nOrder-sensitive means the description contains phrases like:\n- \"the third item in the list\"\n- \"the last button\"\n- \"the first input box\"\n- \"the second row\"\n\nNot order-sensitive means the description is like:\n- \"confirm button\"\n- \"search box\"\n- \"password input\"\n\nFor example, when an element is found and the description is order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found and the description is not order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"isOrderSensitive\": false,\n \"errors\": [\"I can see ..., but {some element} is not found\"]\n}\n\\`\\`\\`\n`;\n }\n\n return `\n## Role:\nYou are an expert in software page image (2D) and page element text analysis.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Return JSON data containing the selection reason and element ID.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Skills:\n- Image analysis and recognition\n- Multilingual text understanding\n- Software UI design and testing\n\n## Workflow:\n1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.\n2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.\n3. Found the required number of elements\n4. 
Return JSON data containing the selection reason and element ID.\n5. Judge whether the user's description is order-sensitive (see below for definition and examples).\n\n## Constraints:\n- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.\n- Elements in the image with NodeType other than \"TEXT Node\" have been highlighted to identify the element among multiple non-text elements.\n- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.\n- If no elements are found, the \"elements\" array should be empty.\n- The returned data must conform to the specified JSON format.\n- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)\n\n## Order-Sensitive Definition:\n- If the description contains phrases like \"the third item in the list\", \"the last button\", \"the first input box\", \"the second row\", etc., it is order-sensitive (isOrderSensitive = true).\n- If the description is like \"confirm button\", \"search box\", \"password input\", etc., it is not order-sensitive (isOrderSensitive = false).\n\n## Output Format:\n\nPlease return the result in JSON format as follows:\n\n\\`\\`\\`json\n{\n \"elements\": [\n // If no matching elements are found, return an empty array []\n {\n \"reason\": \"PLACEHOLDER\", // The thought process for finding the element, replace PLACEHOLDER with your thought process\n \"text\": \"PLACEHOLDER\", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty\n \"id\": \"PLACEHOLDER\" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo\n }\n // More elements...\n ],\n \"isOrderSensitive\": true, // or false, depending on the user's description\n \"errors\": [] // Array of strings containing any error 
messages\n}\n\\`\\`\\`\n\n## Example:\nExample 1:\nInput Example:\n\\`\\`\\`json\n// Description: \"Shopping cart icon in the upper right corner\"\n{\n \"description\": \"PLACEHOLDER\", // Description of the target element\n \"screenshot\": \"path/screenshot.png\",\n \"text\": '{\n \"pageSize\": {\n \"width\": 400, // Width of the page\n \"height\": 905 // Height of the page\n },\n \"elementInfos\": [\n {\n \"id\": \"1231\", // ID of the element\n \"indexId\": \"0\", // Index of the element，The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"https://ap-southeast-3.m\",\n \"class\": \".img\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 280, // Distance from the left side of the page\n \"top\": 8, // Distance from the top of the page\n \"width\": 44, // Width of the element\n \"height\": 44 // Height of the element\n }\n },\n {\n \"id\": \"66551\", // ID of the element\n \"indexId\": \"1\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"data:image/png;base64,iVBORw0KGgoAAAANSU...\",\n \"class\": \".icon\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 350, // Distance from the left side of the page\n \"top\": 16, // Distance from the top of the page\n \"width\": 25, // Width of the element\n \"height\": 25 // Height of the element\n }\n },\n ...\n {\n \"id\": \"12344\",\n \"indexId\": \"2\", // Index of the element，The image is labeled to the left of the element\n \"attributes\": {\n \"nodeType\": \"TEXT Node\",\n \"class\": \".product-name\"\n },\n \"center\": [\n 288,\n 834\n ],\n \"content\": \"Mango Drink\",\n \"rect\": {\n 
\"left\": 188,\n \"top\": 827,\n \"width\": 199,\n \"height\": 13\n }\n },\n ...\n ]\n }\n '\n}\n\\`\\`\\`\nOutput Example:\n\\`\\`\\`json\n{\n \"elements\": [\n {\n // Describe the reason for finding this element, replace with actual value in practice\n \"reason\": \"Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button\",\n \"text\": \"\",\n // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**\n \"id\": \"1231\"\n }\n ],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n \n `;\n}\n\nexport const locatorSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'find_elements',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n elements: {\n type: 'array',\n items: {\n type: 'object',\n properties: {\n reason: {\n type: 'string',\n description: 'Reason for finding this element',\n },\n text: {\n type: 'string',\n description: 'Text content of the element',\n },\n id: {\n type: 'string',\n description: 'ID of this element',\n },\n },\n required: ['reason', 'text', 'id'],\n additionalProperties: false,\n },\n description: 'List of found elements',\n },\n isOrderSensitive: {\n type: 'boolean',\n description:\n 'Whether the targetElementDescription is order-sensitive (true/false)',\n },\n errors: {\n type: 'array',\n items: {\n type: 'string',\n },\n description: 'List of error messages, if any',\n },\n },\n required: ['elements', 'isOrderSensitive', 'errors'],\n additionalProperties: false,\n },\n },\n};\n\nexport const findElementPrompt = new PromptTemplate({\n template: `\nHere is the item user want to find:\n=====================================\n{targetElementDescription}\n=====================================\n\n{pageDescription}\n `,\n inputVariables: ['pageDescription', 
'targetElementDescription'],\n});\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","systemPromptToLocateElement","vlMode","bboxComment","bboxDescription","locatorSchema","findElementPrompt","PromptTemplate"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;ACFO,SAASI,4BACdC,MAAuC;IAEvC,IAAIA,QAAQ;QACV,MAAMC,cAAcC,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBF;QACpC,OAAO,CAAC;;;;;;;;;;;;gDAYoC,EAAEC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuC9D,CAAC;IACC;IAEA,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA8IR,CAAC;AACH;AAEO,MAAME,gBAA0C;IACrD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,UAAU;oBACR,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,YAAY;4BACV,QAAQ;gCACN,MAAM;gCACN,aAAa;4BACf;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,IAAI;gCACF,MAAM;gCACN,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAU;4BAAQ;yBAAK;wBAClC,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,kBAAkB;oBAChB,MAAM;oBACN,aACE;gBACJ;gBACA,QAAQ;oBACN,MAAM;oBACN,OAAO;wBACL,MAAM;oBACR;oBACA,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAY;gBAAoB;aAAS;YACpD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,oBAAoB,IAAIC,wBAAAA,cAAcA,CAAC;IAClD,UAAU,CAAC;;;;;;;EAOX,CAAC;IACD,gBAAgB;QAAC;QAAmB;KAA2B;AACjE"}

package/dist/lib/ai-model/prompt/llm-planning.js ADDED Viewed

@@ -0,0 +1,415 @@
+"use strict";
+var __webpack_require__ = {};
+(()=>{
+    __webpack_require__.n = (module)=>{
+        var getter = module && module.__esModule ? ()=>module['default'] : ()=>module;
+        __webpack_require__.d(getter, {
+            a: getter
+        });
+        return getter;
+    };
+})();
+(()=>{
+    __webpack_require__.d = (exports1, definition)=>{
+        for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
+            enumerable: true,
+            get: definition[key]
+        });
+    };
+})();
+(()=>{
+    __webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
+})();
+(()=>{
+    __webpack_require__.r = (exports1)=>{
+        if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
+            value: 'Module'
+        });
+        Object.defineProperty(exports1, '__esModule', {
+            value: true
+        });
+    };
+})();
+var __webpack_exports__ = {};
+__webpack_require__.r(__webpack_exports__);
+__webpack_require__.d(__webpack_exports__, {
+    systemPromptToTaskPlanning: ()=>systemPromptToTaskPlanning,
+    descriptionForAction: ()=>descriptionForAction,
+    generateTaskBackgroundContext: ()=>generateTaskBackgroundContext,
+    planSchema: ()=>planSchema,
+    automationUserPrompt: ()=>automationUserPrompt
+});
+const external_node_assert_namespaceObject = require("node:assert");
+var external_node_assert_default = /*#__PURE__*/ __webpack_require__.n(external_node_assert_namespaceObject);
+const prompts_namespaceObject = require("@langchain/core/prompts");
+const external_common_js_namespaceObject = require("./common.js");
+const vlCoTLog = '"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. ';
+const vlCurrentLog = '"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do .. first". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
+const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
+const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
+  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
+const vlLocateParam = (required)=>`locate${required ? '' : '?'}: {bbox: [number, number, number, number], prompt: string }`;
+const llmLocateParam = (required)=>`locate${required ? '' : '?'}: {"id": string, "prompt": string}`;
+const descriptionForAction = (action, locatorScheme)=>{
+    const tab = '  ';
+    let locateParam = '';
+    if ('required' === action.location) locateParam = locatorScheme;
+    else if ('optional' === action.location) locateParam = `${locatorScheme} | null`;
+    else if (false === action.location) locateParam = '';
+    const locatorParam = locateParam ? `- ${locateParam}` : '';
+    if (action.whatToLocate) if (locateParam) locateParam += ` // ${action.whatToLocate}`;
+    else console.warn(`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`);
+    let paramSchema = '';
+    if (action.paramSchema) paramSchema = `- param: ${action.paramSchema}`;
+    if (action.paramDescription) {
+        external_node_assert_default()(paramSchema, `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`);
+        paramSchema += ` // ${action.paramDescription}`;
+    }
+    const fields = [
+        paramSchema,
+        locatorParam
+    ].filter(Boolean);
+    return `- ${action.name}, ${action.description}
+${tab}- type: "${action.name}"
+${tab}${fields.join(`\n${tab}`)}
+`.trim();
+};
+const systemTemplateOfVLPlanning = ({ actionSpace, vlMode })=>{
+    const actionNameList = actionSpace.map((action)=>action.name).join(', ');
+    const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam('required' === action.location)));
+    const actionList = actionDescriptionList.join('\n');
+    return `
+Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
+Restriction:
+- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
+- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
+- Don't repeat actions in the previous logs.
+- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${(0, external_common_js_namespaceObject.bboxDescription)(vlMode)}.
+Supporting actions:
+${actionList}
+Field description:
+* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
+Return in JSON format:
+{
+  ${vlCoTLog}
+  ${vlCurrentLog}
+  ${commonOutputFields}
+  "action":
+    {
+      // one of the supporting actions
+    } | null,
+  ,
+  "sleep"?: number, // The sleep time after the action, in milliseconds.
+}
+For example, when the instruction is "click 'Confirm' button, and click 'Yes' in popup" and the log is "I will use action Tap to click 'Confirm' button", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.
+this and output the JSON:
+{
+  "what_the_user_wants_to_do_next_by_instruction": "We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup",
+  "log": "I will use action Tap to click 'Yes' in popup",
+  "more_actions_needed_by_instruction": false,
+  "action": {
+    "type": "Tap",
+    "locate": {
+      "bbox": [100, 100, 200, 200],
+      "prompt": "The 'Yes' button in popup"
+    }
+  }
+}
+`;
+};
+const systemTemplateOfLLM = ({ actionSpace })=>{
+    const actionNameList = actionSpace.map((action)=>action.name).join(' / ');
+    const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam('required' === action.location)));
+    const actionList = actionDescriptionList.join('\n');
+    return `
+## Role
+You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
+## Objective
+- Decompose the instruction user asked into a series of actions
+- Locate the target element if possible
+- If the instruction cannot be accomplished, give a further plan.
+## Workflow
+1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
+2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
+3. Consider whether the user's instruction will be accomplished after the actions you composed.
+- If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
+- If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
+4. If the task is not feasible on this page, set \`error\` field to the reason.
+## Constraints
+- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
+- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
+- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
+- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
+## About the \`actions\` field
+The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
+type LocateParam = {
+  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
+  "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
+} | null // If it's not on the page, the LocateParam should be null
+## Supported actions
+Each action has a \`type\` and corresponding \`param\`. To be detailed:
+${actionList}
+`.trim();
+};
+const outputTemplate = `
+## Output JSON Format:
+The JSON format is as follows:
+{
+  "actions": [
+    // ... some actions
+  ],
+  ${llmCurrentLog}
+  ${commonOutputFields}
+}
+## Examples
+### Example: Decompose a task
+When you received the following information:
+* Instruction: 'Click the language switch button, wait 1s, click "English"'
+* Logs: null
+* Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
+By viewing the page screenshot and description, you should consider this and output the JSON:
+* The user intent is: tap the switch button, sleep, and tap the 'English' option
+* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
+* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
+* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
+* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
+* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
+{
+  "actions":[
+    {
+      "thought": "Click the language switch button to open the language options.",
+      "type": "Tap",
+      "param": null,
+      "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
+    },
+    {
+      "thought": "Wait for 1 second to ensure the language options are displayed.",
+      "type": "Sleep",
+      "param": { "timeMs": 1000 },
+    }
+  ],
+  "error": null,
+  "more_actions_needed_by_instruction": true,
+  "log": "Click the language switch button to open the language options. Wait for 1 second",
+}
+### Example: What NOT to do
+Wrong output:
+{
+  "actions":[
+    {
+      "thought": "Click the language switch button to open the language options.",
+      "type": "Tap",
+      "param": null,
+      "locate": {
+        { "id": "c81c4e9a33" }, // WRONG: prompt is missing, this is not a valid LocateParam
+      }
+    },
+    {
+      "thought": "Click the English option",
+      "type": "Tap",
+      "param": null,
+      "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
+    }
+  ],
+  "more_actions_needed_by_instruction": false, // WRONG: should be true
+  "log": "Click the language switch button to open the language options",
+}
+`;
+async function systemPromptToTaskPlanning({ actionSpace, vlMode }) {
+    if (vlMode) return systemTemplateOfVLPlanning({
+        actionSpace,
+        vlMode
+    });
+    return `${systemTemplateOfLLM({
+        actionSpace
+    })}\n\n${outputTemplate}`;
+}
/**
 * Response-format descriptor of type `json_schema`, named `action_items`.
 *
 * The described object has four required fields: `actions`,
 * `more_actions_needed_by_instruction`, `log` and `error`.
 * NOTE(review): presumably handed to the model client as its response format;
 * confirm at the call site.
 */
const planSchema = (()=>{
    // Target element reference: an object with string `id` and `prompt`, or null.
    const locate = {
        type: ['object', 'null'],
        properties: {
            id: { type: 'string' },
            prompt: { type: 'string' }
        },
        required: ['id', 'prompt'],
        additionalProperties: false,
        description: 'Location information for the target element'
    };
    // Action parameter: null, or an object with arbitrary properties.
    const param = {
        anyOf: [
            { type: 'null' },
            { type: 'object', additionalProperties: true }
        ],
        description: 'Parameter of the action'
    };
    // One entry of the `actions` array.
    const actionItem = {
        type: 'object',
        strict: false,
        properties: {
            thought: {
                type: 'string',
                description: 'Reasons for generating this task, and why this task is feasible on this page'
            },
            type: {
                type: 'string',
                description: 'Type of action'
            },
            param,
            locate
        },
        required: ['thought', 'type', 'param', 'locate'],
        additionalProperties: false
    };
    return {
        type: 'json_schema',
        json_schema: {
            name: 'action_items',
            strict: false,
            schema: {
                type: 'object',
                strict: false,
                properties: {
                    actions: {
                        type: 'array',
                        items: actionItem,
                        description: 'List of actions to be performed'
                    },
                    more_actions_needed_by_instruction: {
                        type: 'boolean',
                        description: 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.'
                    },
                    log: {
                        type: 'string',
                        description: 'Log what these planned actions do. Do not include further actions that have not been planned.'
                    },
                    error: {
                        type: ['string', 'null'],
                        description: 'Error messages about unexpected situations'
                    }
                },
                required: ['actions', 'more_actions_needed_by_instruction', 'log', 'error'],
                additionalProperties: false
            }
        }
    };
})();
/**
 * Build the task-background section of the planning prompt.
 *
 * @param {string} userInstruction - Placed inside the `<instruction>` tag.
 * @param {string} [log] - Previous-execution logs. When truthy, a
 *   `<previous_logs>` section is appended after the instruction.
 * @param {string} [userActionContext] - Placed inside
 *   `<high_priority_knowledge>`. May be omitted.
 * @returns {string} The assembled prompt fragment.
 */
const generateTaskBackgroundContext = (userInstruction, log, userActionContext)=>{
    // An omitted context used to be interpolated as the literal text
    // "undefined" (or "null"); render it as empty instead.
    const knowledge = userActionContext == null ? '' : userActionContext;
    // Shared by both variants: the output without logs is an exact prefix of
    // the output with logs.
    const instructionSection = `
Here is the user's instruction:
<instruction>
  <high_priority_knowledge>
    ${knowledge}
  </high_priority_knowledge>
  ${userInstruction}
</instruction>
`;
    if (!log) return instructionSection;
    return `${instructionSection}These are the logs from previous executions, which indicate what was done in the previous actions.
Do NOT repeat these actions.
<previous_logs>
${log}
</previous_logs>
`;
};
/**
 * Create the user-message prompt template for automation planning.
 *
 * @param {*} vlMode - When truthy, the template contains only the
 *   `taskBackgroundContext` variable. Otherwise it also has a delimited
 *   `pageDescription` section placed before it.
 * @returns {object} A new `PromptTemplate` instance.
 */
const automationUserPrompt = (vlMode)=>{
    const { PromptTemplate } = prompts_namespaceObject;
    const template = vlMode ? '{taskBackgroundContext}' : `
pageDescription:
=====================================
{pageDescription}
=====================================
{taskBackgroundContext}`;
    const inputVariables = vlMode
        ? ['taskBackgroundContext']
        : ["pageDescription", 'taskBackgroundContext'];
    return new PromptTemplate({ template, inputVariables });
};
// Publish the module's named exports on the CommonJS `exports` object.
// NOTE(review): these are kept as literal `exports.name = ...` statements;
// tooling that detects CommonJS exports statically may rely on this form.
exports.automationUserPrompt = __webpack_exports__.automationUserPrompt;
exports.descriptionForAction = __webpack_exports__.descriptionForAction;
exports.generateTaskBackgroundContext = __webpack_exports__.generateTaskBackgroundContext;
exports.planSchema = __webpack_exports__.planSchema;
exports.systemPromptToTaskPlanning = __webpack_exports__.systemPromptToTaskPlanning;
// Copy across any other enumerable key of __webpack_exports__ that is not
// one of the names assigned above.
for (var __webpack_i__ in __webpack_exports__) {
    const isListedAbove = [
        "automationUserPrompt",
        "descriptionForAction",
        "generateTaskBackgroundContext",
        "planSchema",
        "systemPromptToTaskPlanning"
    ].includes(__webpack_i__);
    if (!isListedAbove) {
        exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
    }
}
// Mark the CommonJS exports object with the `__esModule` flag.
Object.defineProperty(exports, '__esModule', { value: true });
+//# sourceMappingURL=llm-planning.js.map

package/dist/lib/ai-model/prompt/llm-planning.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"ai-model/prompt/llm-planning.js","sources":["webpack://@midscene/core/webpack/runtime/compat_get_default_export","webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["// getDefaultExport function for compatibility with non-ESM modules\n__webpack_require__.n = (module) => {\n\tvar getter = module && module.__esModule ?\n\t\t() => (module['default']) :\n\t\t() => (module);\n\t__webpack_require__.d(getter, { a: getter });\n\treturn getter;\n};\n","__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import assert from 'node:assert';\nimport type { DeviceAction } from '@/types';\nimport { PromptTemplate } from '@langchain/core/prompts';\nimport type { vlLocateMode } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\nconst vlCoTLog = `\"what_the_user_wants_to_do_next_by_instruction\": string, // What the user wants to do according to the instruction and previous logs. `;\nconst vlCurrentLog = `\"log\": string, // Log what the next one action (ONLY ONE!) 
you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do .. first\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\nconst llmCurrentLog = `\"log\": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do ..\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\nconst vlLocateParam = (required: boolean) =>\n `locate${required ? '' : '?'}: {bbox: [number, number, number, number], prompt: string }`;\nconst llmLocateParam = (required: boolean) =>\n `locate${required ? '' : '?'}: {\"id\": string, \"prompt\": string}`;\n\nexport const descriptionForAction = (\n action: DeviceAction,\n locatorScheme: string,\n) => {\n const tab = ' ';\n let locateParam = '';\n if (action.location === 'required') {\n locateParam = locatorScheme;\n } else if (action.location === 'optional') {\n locateParam = `${locatorScheme} | null`;\n } else if (action.location === false) {\n locateParam = '';\n }\n const locatorParam = locateParam ? `- ${locateParam}` : '';\n\n if (action.whatToLocate) {\n if (!locateParam) {\n console.warn(\n `whatToLocate is provided for action ${action.name}, but location is not required or optional. 
The whatToLocate will be ignored.`,\n );\n } else {\n locateParam += ` // ${action.whatToLocate}`;\n }\n }\n\n let paramSchema = '';\n if (action.paramSchema) {\n paramSchema = `- param: ${action.paramSchema}`;\n }\n if (action.paramDescription) {\n assert(\n paramSchema,\n `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`,\n );\n paramSchema += ` // ${action.paramDescription}`;\n }\n\n const fields = [paramSchema, locatorParam].filter(Boolean);\n\n return `- ${action.name}, ${action.description}\n${tab}- type: \"${action.name}\"\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nconst systemTemplateOfVLPlanning = ({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) => {\n const actionNameList = actionSpace.map((action) => action.name).join(', ');\n const actionDescriptionList = actionSpace.map((action) =>\n descriptionForAction(action, vlLocateParam(action.location === 'required')),\n );\n const actionList = actionDescriptionList.join('\\n');\n\n return `\nTarget: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\nRestriction:\n- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.\n- Always give ONLY ONE action in \\`log\\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.\n- Don't repeat actions in the previous logs.\n- Bbox is the bounding box of the element to be located. 
It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.\n\nSupporting actions:\n${actionList}\n\nField description:\n* The \\`prompt\\` field inside the \\`locate\\` field is a short description that could be used to locate the element.\n\nReturn in JSON format:\n{\n ${vlCoTLog}\n ${vlCurrentLog}\n ${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, when the instruction is \"click 'Confirm' button, and click 'Yes' in popup\" and the log is \"I will use action Tap to click 'Confirm' button\", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.\n\nthis and output the JSON:\n\n{\n \"what_the_user_wants_to_do_next_by_instruction\": \"We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup\",\n \"log\": \"I will use action Tap to click 'Yes' in popup\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox\": [100, 100, 200, 200],\n \"prompt\": \"The 'Yes' button in popup\"\n }\n }\n}\n`;\n};\n\nconst systemTemplateOfLLM = ({\n actionSpace,\n}: { actionSpace: DeviceAction[] }) => {\n const actionNameList = actionSpace.map((action) => action.name).join(' / ');\n const actionDescriptionList = actionSpace.map((action) =>\n descriptionForAction(\n action,\n llmLocateParam(action.location === 'required'),\n ),\n );\n const actionList = actionDescriptionList.join('\\n');\n\n return `\n## Role\n\nYou are a versatile professional in software UI automation. 
Your outstanding contributions will impact the user experience of billions of users.\n\n## Objective\n\n- Decompose the instruction user asked into a series of actions\n- Locate the target element if possible\n- If the instruction cannot be accomplished, give a further plan.\n\n## Workflow\n\n1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.\n2. Decompose the user's task into a sequence of feasible actions, and place it in the \\`actions\\` field. There are different types of actions (${actionNameList}). The \"About the action\" section below will give you more details.\n3. Consider whether the user's instruction will be accomplished after the actions you composed.\n- If the instruction is accomplished, set \\`more_actions_needed_by_instruction\\` to false.\n- If more actions are needed, set \\`more_actions_needed_by_instruction\\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \\`log\\` field, he or she will continue the task according to your logs.\n4. If the task is not feasible on this page, set \\`error\\` field to the reason.\n\n## Constraints\n\n- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.\n- Trust the \"What have been done\" field about the task (if any), don't repeat actions in it.\n- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \\`\\`\\`json\\`\\`\\`.\n- If the screenshot and the instruction are totally irrelevant, set reason in the \\`error\\` field.\n\n## About the \\`actions\\` field\n\nThe \\`locate\\` param is commonly used in the \\`param\\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:\n\ntype LocateParam = {\n \"id\": string, // the id of the element found. 
It should either be the id marked with a rectangle in the screenshot or the id described in the description.\n \"prompt\"?: string // the description of the element to find. It can only be omitted when locate is null.\n} | null // If it's not on the page, the LocateParam should be null\n\n## Supported actions\n\nEach action has a \\`type\\` and corresponding \\`param\\`. To be detailed:\n${actionList}\n\n`.trim();\n};\n\nconst outputTemplate = `\n## Output JSON Format:\n\nThe JSON format is as follows:\n\n{\n \"actions\": [\n // ... some actions\n ],\n ${llmCurrentLog}\n ${commonOutputFields}\n}\n\n## Examples\n\n### Example: Decompose a task\n\nWhen you received the following information:\n\n* Instruction: 'Click the language switch button, wait 1s, click \"English\"'\n* Logs: null\n* Page Context (screenshot and description) shows: There is a language switch button, and the \"English\" option is not shown in the screenshot now.\n\nBy viewing the page screenshot and description, you should consider this and output the JSON:\n\n* The user intent is: tap the switch button, sleep, and tap the 'English' option\n* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.\n* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.\n* The \"English\" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.\n* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.\n* The task cannot be accomplished (because the last tapping action is not finished yet), so the \\`more_actions_needed_by_instruction\\` field is true. 
The \\`error\\` field is null.\n\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": { id: \"c81c4e9a33\", prompt: \"The language switch button\" }},\n },\n {\n \"thought\": \"Wait for 1 second to ensure the language options are displayed.\",\n \"type\": \"Sleep\",\n \"param\": { \"timeMs\": 1000 },\n }\n ],\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"log\": \"Click the language switch button to open the language options. Wait for 1 second\",\n}\n\n### Example: What NOT to do\nWrong output:\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\",\n \"param\": null,\n \"locate\": {\n { \"id\": \"c81c4e9a33\" }, // WRONG: prompt is missing, this is not a valid LocateParam\n }\n },\n {\n \"thought\": \"Click the English option\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished\n }\n ],\n \"more_actions_needed_by_instruction\": false, // WRONG: should be true\n \"log\": \"Click the language switch button to open the language options\",\n}\n`;\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) {\n if (vlMode) {\n return systemTemplateOfVLPlanning({ actionSpace, vlMode });\n }\n\n return `${systemTemplateOfLLM({ actionSpace })}\\n\\n${outputTemplate}`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is 
feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n\nexport const generateTaskBackgroundContext = (\n userInstruction: string,\n log?: string,\n userActionContext?: string,\n) => {\n if (log) {\n return `\nHere is the user's instruction:\n\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n\nThese are the logs from previous executions, which indicate what was done in the previous actions.\nDo NOT repeat these actions.\n<previous_logs>\n${log}\n</previous_logs>\n`;\n }\n\n return `\nHere is the user's instruction:\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n`;\n};\n\nexport const automationUserPrompt = (\n vlMode: ReturnType<typeof 
vlLocateMode>,\n) => {\n if (vlMode) {\n return new PromptTemplate({\n template: '{taskBackgroundContext}',\n inputVariables: ['taskBackgroundContext'],\n });\n }\n\n return new PromptTemplate({\n template: `\npageDescription:\n=====================================\n{pageDescription}\n=====================================\n\n{taskBackgroundContext}`,\n inputVariables: ['pageDescription', 'taskBackgroundContext'],\n });\n};\n"],"names":["__webpack_require__","module","getter","definition","key","Object","obj","prop","Symbol","vlCoTLog","vlCurrentLog","llmCurrentLog","commonOutputFields","vlLocateParam","required","llmLocateParam","descriptionForAction","action","locatorScheme","tab","locateParam","locatorParam","console","paramSchema","assert","fields","Boolean","systemTemplateOfVLPlanning","actionSpace","vlMode","actionNameList","actionDescriptionList","actionList","bboxDescription","systemTemplateOfLLM","outputTemplate","systemPromptToTaskPlanning","planSchema","generateTaskBackgroundContext","userInstruction","log","userActionContext","automationUserPrompt","PromptTemplate"],"mappings":";;;IACAA,oBAAoB,CAAC,GAAG,CAACC;QACxB,IAAIC,SAASD,UAAUA,OAAO,UAAU,GACvC,IAAOA,MAAM,CAAC,UAAU,GACxB,IAAOA;QACRD,oBAAoB,CAAC,CAACE,QAAQ;YAAE,GAAGA;QAAO;QAC1C,OAAOA;IACR;;;ICPAF,oBAAoB,CAAC,GAAG,CAAC,UAASG;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGH,oBAAoB,CAAC,CAACG,YAAYC,QAAQ,CAACJ,oBAAoB,CAAC,CAAC,UAASI,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAJ,oBAAoB,CAAC,GAAG,CAACM,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFP,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOQ,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;ACEA,MAAMI,WAAW;AACjB,MAAMC,eAAe;AACrB,MAAMC,gBAAgB;AAEtB,MAAMC,qBAAqB,CAAC;+NACmM,CAAC;AAChO,MAAMC,gBAAgB,CAACC,WACrB,CAAC,MAAM,EAAEA,WAAW,KAAK,IAAI,2DAA2D,CAAC;AAC3F,MAAMC,iBAAiB,CAACD,WACtB,CAAC,MAAM,EAAEA,WAAW,KAAK,IAAI,kCAAkC,
CAAC;AAE3D,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,IAAIC,cAAc;IAClB,IAAIH,AAAoB,eAApBA,OAAO,QAAQ,EACjBG,cAAcF;SACT,IAAID,AAAoB,eAApBA,OAAO,QAAQ,EACxBG,cAAc,GAAGF,cAAc,OAAO,CAAC;SAClC,IAAID,AAAoB,UAApBA,OAAO,QAAQ,EACxBG,cAAc;IAEhB,MAAMC,eAAeD,cAAc,CAAC,EAAE,EAAEA,aAAa,GAAG;IAExD,IAAIH,OAAO,YAAY,EACrB,IAAKG,aAKHA,eAAe,CAAC,IAAI,EAAEH,OAAO,YAAY,EAAE;SAJ3CK,QAAQ,IAAI,CACV,CAAC,oCAAoC,EAAEL,OAAO,IAAI,CAAC,6EAA6E,CAAC;IAOvI,IAAIM,cAAc;IAClB,IAAIN,OAAO,WAAW,EACpBM,cAAc,CAAC,SAAS,EAAEN,OAAO,WAAW,EAAE;IAEhD,IAAIA,OAAO,gBAAgB,EAAE;QAC3BO,+BACED,aACA,CAAC,qEAAqE,EAAEN,OAAO,IAAI,CAAC,UAAU,EAAEA,OAAO,WAAW,EAAE;QAEtHM,eAAe,CAAC,IAAI,EAAEN,OAAO,gBAAgB,EAAE;IACjD;IAEA,MAAMQ,SAAS;QAACF;QAAaF;KAAa,CAAC,MAAM,CAACK;IAElD,OAAO,CAAC,EAAE,EAAET,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,CAAC;AACjD,EAAEE,IAAI,SAAS,EAAEF,OAAO,IAAI,CAAC;AAC7B,EAAEE,MAAMM,OAAO,IAAI,CAAC,CAAC,EAAE,EAAEN,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEA,MAAMQ,6BAA6B,CAAC,EAClCC,WAAW,EACXC,MAAM,EAIP;IACC,MAAMC,iBAAiBF,YAAY,GAAG,CAAC,CAACX,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMc,wBAAwBH,YAAY,GAAG,CAAC,CAACX,SAC7CD,qBAAqBC,QAAQJ,cAAcI,AAAoB,eAApBA,OAAO,QAAQ;IAE5D,MAAMe,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;yIAK+H,EAAED,eAAe;;kGAExD,EAAEG,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBJ,QAAQ;;;AAG5H,EAAEG,WAAW;;;;;;;EAOX,EAAEvB,SAAS;EACX,EAAEC,aAAa;EACf,EAAEE,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;AAyBvB,CAAC;AACD;AAEA,MAAMsB,sBAAsB,CAAC,EAC3BN,WAAW,EACqB;IAChC,MAAME,iBAAiBF,YAAY,GAAG,CAAC,CAACX,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMc,wBAAwBH,YAAY,GAAG,CAAC,CAACX,SAC7CD,qBACEC,QACAF,eAAeE,AAAoB,eAApBA,OAAO,QAAQ;IAGlC,MAAMe,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;;;;;;;;;;+IAcqI,EAAED,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;AAyBhK,EAAEE,WAAW;;AAEb,CAAC,CAAC,IAAI;AACN;AAEA,MAAMG,iBAAiB,CAAC;;;;;;;;;EAStB,EAAExB,cAAc;EAChB,EAAEC,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA+DvB,CAAC;AAEM,eAAewB,2BAA2B,EAC/CR,WAAW,EACXC,MAAM,EAIP;IACC,IAAIA,QACF,OAAOF,2BAA2B;QAAEC;QAAaC;IAAO;IAG1D,OAAO,GAAGK,oBAAoB;QAAEN;IAAY,GAAG,IAA
I,EAAEO,gBAAgB;AACvE;AAEO,MAAME,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,gCAAgC,CAC3CC,iBACAC,KACAC;IAEA,IAAID,KACF,OAAO,CAAC;;;;;IAKR,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;;;;;AAMpB,EAAEC,IAAI;;AAEN,CAAC;IAGC,OAAO,CAAC;;;;IAIN,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;AAEpB,CAAC;AACD;AAEO,MAAMG,uBAAuB,CAClCb;IAEA,IAAIA,QACF,OAAO,IAAIc,wBAAAA,cAAcA,CAAC;QACxB,UAAU;QACV,gBAAgB;YAAC;SAAwB;IAC3C;IAGF,OAAO,IAAIA,wBAAAA,cAAcA,CAAC;QACxB,UAAU,CAAC;;;;;;uBAMQ,CAAC;QACpB,gBAAgB;YAAC;YAAmB;SAAwB;IAC9D;AACF"}

package/dist/lib/ai-model/prompt/llm-section-locator.js ADDED Viewed

@@ -0,0 +1,84 @@
"use strict";
// Minimal bundler runtime: a bag of helper functions used to assemble the
// module's export namespace.
var __webpack_require__ = {};
// o(obj, prop): true when `prop` is an own property of `obj`.
__webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
// d(target, definition): for each own key of `definition` that `target` does
// not already own, define an enumerable getter on `target`.
__webpack_require__.d = (exports1, definition)=>{
    for (const key of Object.keys(definition)) {
        if (__webpack_require__.o(exports1, key)) continue;
        Object.defineProperty(exports1, key, {
            enumerable: true,
            get: definition[key]
        });
    }
};
// r(target): tag `target` as a module namespace, setting Symbol.toStringTag
// to 'Module' (when available) and `__esModule` to true.
__webpack_require__.r = (exports1)=>{
    const canTag = 'undefined' != typeof Symbol && Symbol.toStringTag;
    if (canTag) {
        Object.defineProperty(exports1, Symbol.toStringTag, { value: 'Module' });
    }
    Object.defineProperty(exports1, '__esModule', { value: true });
};
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
// Getters are used so each binding is read only when the export is accessed;
// both names are defined later in the file.
__webpack_require__.d(__webpack_exports__, {
    sectionLocatorInstruction: ()=>sectionLocatorInstruction,
    systemPromptToLocateSection: ()=>systemPromptToLocateSection
});
+const prompts_namespaceObject = require("@langchain/core/prompts");
+const external_common_js_namespaceObject = require("./common.js");
/**
 * Build the system prompt that asks for the section containing a target
 * element, plus sections for any reference elements.
 *
 * The prompt requests JSON with `bbox`, optional `references_bbox` and
 * optional `error`. The wording that explains what the bbox numbers represent
 * comes from `bboxDescription(vlMode)`.
 *
 * @param {*} vlMode - Forwarded unchanged to `bboxDescription`.
 * @returns {string} The prompt text.
 */
function systemPromptToLocateSection(vlMode) {
    const { bboxDescription } = external_common_js_namespaceObject;
    const bboxMeaning = bboxDescription(vlMode);
    return `
You goal is to find out one section containing the target element in the screenshot, put it in the \`bbox\` field. If the user describe the target element with some reference elements, you should also find the section containing the reference elements, put it in the \`references_bbox\` field.
Usually, it should be approximately an area not more than 300x300px. Changes of the size are allowed if there are many elements to cover.
return in this JSON format:
\`\`\`json
{
  "bbox": [number, number, number, number],
  "references_bbox"?: [
    [number, number, number, number],
    [number, number, number, number],
    ...
  ],
  "error"?: string
}
\`\`\`
In which, all the numbers in the \`bbox\` and \`references_bbox\` represent ${bboxMeaning}.
For example, if the user describe the target element as "the delete button on the second row with title 'Peter'", you should put the bounding box of the delete button in the \`bbox\` field, and the bounding box of the second row in the \`references_bbox\` field.
the return value should be like this:
\`\`\`json
{
  "bbox": [100, 100, 200, 200],
  "references_bbox": [[100, 100, 200, 200]]
}
\`\`\`
`;
}
// User-message template for section location. Its single input variable,
// `sectionDescription`, is placed inside a <targetDescription> tag.
const sectionLocatorInstruction = new prompts_namespaceObject.PromptTemplate({
    // The last entry keeps the two trailing spaces after the final newline.
    template: [
        'Here is the target element user interested in:',
        '<targetDescription>',
        '{sectionDescription}',
        '</targetDescription>',
        '  '
    ].join('\n'),
    inputVariables: ["sectionDescription"]
});
// Publish the module's named exports on the CommonJS `exports` object.
// NOTE(review): these are kept as literal `exports.name = ...` statements;
// tooling that detects CommonJS exports statically may rely on this form.
exports.sectionLocatorInstruction = __webpack_exports__.sectionLocatorInstruction;
exports.systemPromptToLocateSection = __webpack_exports__.systemPromptToLocateSection;
// Copy across any other enumerable key of __webpack_exports__ that is not
// one of the names assigned above.
for (var __webpack_i__ in __webpack_exports__) {
    const isListedAbove = [
        "sectionLocatorInstruction",
        "systemPromptToLocateSection"
    ].includes(__webpack_i__);
    if (!isListedAbove) {
        exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
    }
}
// Mark the CommonJS exports object with the `__esModule` flag.
Object.defineProperty(exports, '__esModule', { value: true });
+//# sourceMappingURL=llm-section-locator.js.map

package/dist/lib/ai-model/prompt/llm-section-locator.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"ai-model/prompt/llm-section-locator.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/prompt/llm-section-locator.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { PromptTemplate } from '@langchain/core/prompts';\nimport type { vlLocateMode } from '@midscene/shared/env';\nimport { bboxDescription } from './common';\n\nexport function systemPromptToLocateSection(\n vlMode: ReturnType<typeof vlLocateMode>,\n) {\n return `\nYou goal is to find out one section containing the target element in the screenshot, put it in the \\`bbox\\` field. If the user describe the target element with some reference elements, you should also find the section containing the reference elements, put it in the \\`references_bbox\\` field.\n\nUsually, it should be approximately an area not more than 300x300px. 
Changes of the size are allowed if there are many elements to cover.\n\nreturn in this JSON format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number],\n \"references_bbox\"?: [\n [number, number, number, number],\n [number, number, number, number],\n ...\n ],\n \"error\"?: string\n}\n\\`\\`\\`\n\nIn which, all the numbers in the \\`bbox\\` and \\`references_bbox\\` represent ${bboxDescription(vlMode)}.\n\nFor example, if the user describe the target element as \"the delete button on the second row with title 'Peter'\", you should put the bounding box of the delete button in the \\`bbox\\` field, and the bounding box of the second row in the \\`references_bbox\\` field.\n\nthe return value should be like this:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"references_bbox\": [[100, 100, 200, 200]]\n}\n\\`\\`\\`\n`;\n}\n\nexport const sectionLocatorInstruction = new PromptTemplate({\n template: `Here is the target element user interested in:\n<targetDescription>\n{sectionDescription}\n</targetDescription>\n `,\n inputVariables: 
['sectionDescription'],\n});\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","systemPromptToLocateSection","vlMode","bboxDescription","sectionLocatorInstruction","PromptTemplate"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;ACFO,SAASI,4BACdC,MAAuC;IAEvC,OAAO,CAAC;;;;;;;;;;;;;;;;;;4EAkBkE,EAAEC,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBD,QAAQ;;;;;;;;;;;AAWtG,CAAC;AACD;AAEO,MAAME,4BAA4B,IAAIC,wBAAAA,cAAcA,CAAC;IAC1D,UAAU,CAAC;;;;EAIX,CAAC;IACD,gBAAgB;QAAC;KAAqB;AACxC"}