npm - @midscene/core - Versions diffs - 0.8.4 → 0.8.5-beta-20241122072506.0 - Mend

@midscene/core 0.8.4 → 0.8.5-beta-20241122072506.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/lib/ai-model.js +102 -51
package/dist/lib/index.js +110 -53
package/dist/lib/types/ai-model.d.ts +2 -2
package/dist/lib/types/{index-690c2a06.d.ts → index-152b8346.d.ts} +1 -1
package/dist/lib/types/index.d.ts +5 -5
package/dist/lib/types/{types-29994b1b.d.ts → types-0d8eeece.d.ts} +3 -1
package/dist/lib/types/utils.d.ts +3 -3
package/dist/lib/utils.js +2 -1
package/package.json +2 -2
package/report/index.html +2 -2

package/dist/lib/ai-model.js CHANGED

@@ -4497,7 +4497,7 @@ Input Example:
       },
       "elementInfos": [
         {
-          "id": "we23xsfwe", // ID of the element
+          "id": "1231", // ID of the element
           "indexId": "0", // Index of the element，The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4513,7 +4513,7 @@ Input Example:
           }
         },
         {
-          "id": "wefew2222few2", // ID of the element
+          "id": "66551", // ID of the element
           "indexId": "1", // Index of the element,The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4530,7 +4530,7 @@ Input Example:
         },
         ...
         {
-          "id": "kwekfj2323",
+          "id": "12344",
           "indexId": "2", // Index of the element，The image is labeled to the left of the element
           "attributes": {
             "nodeType": "TEXT Node",
@@ -4563,7 +4563,7 @@ Output Example:
       "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
       "text": "",
       // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
-      "id": "wefew2222few2"
+      "id": "1231"
     }
   ],
   "errors": []
@@ -4650,6 +4650,19 @@ var findElementSchema = {
 };
 // src/ai-model/prompt/planning.ts
+var quickAnswerFormat = () => {
+  const matchByPosition = getAIConfig(MATCH_BY_POSITION);
+  const description = `
+  ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
+  `;
+  const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
+  const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
+  return {
+    description,
+    format,
+    sample
+  };
+};
 function systemPromptToTaskPlanning() {
   return `
 ## Role:
@@ -4673,32 +4686,24 @@ Each action has a type and corresponding param. To be detailed:
 * type: 'KeyboardPress',  press a key
   * param: { value: string },  the value to input or the key to press. Use （Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta） to represent the key.
 * type: 'Scroll'
-  * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
+  * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
 * type: 'Error'
   * param: { message: string }, the error message
 * type: 'Sleep'
   * param: { timeMs: number }, wait for timeMs milliseconds
-Here is an example of how to decompose a task.
-When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
-* Locate: 'The search bar'
-* Input: 'Weather in Shanghai'
-* Sleep: 1000
-* KeyboardPress: 'Enter'
 Remember:
 1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
-2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
-If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
+## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
-## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
-If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
+If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
 {
-  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
+  "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
   "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-  ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+  ${quickAnswerFormat().description}
 }
 If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4711,33 +4716,71 @@ Please return the result in JSON format as follows:
   actions: [ // always return in Array
     {
       "thought": "find out the search bar",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
+      "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
+      "param": { //
         "prompt": "The search bar"
       },
-      "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
-        "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
-        "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-        ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+      "quickAnswer": {
+        "reason": "This is ...",
+        "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+        ${quickAnswerFormat().format}
       } | null,
     },
     {
       "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
-      "param": any, // Parameter towards the task type
+      "type": "Tap",
+      "param": null,
     },
+    // ... more actions
+  ],
+  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
+}
+## Here is an example of how to decompose a task
+When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
+* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
+* Think and look in detail and fill all the fields in the JSON format.
+\`\`\`json
+{
+  queryLanguage: 'English',
+  actions:[
     {
-      "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
-        "prompt": "The search bar"
+      thought: "Locate the language switch button with the text '中文'.",
+      type: 'Locate',
+      param: { prompt: "The language switch button with the text '中文'" },
+      quickAnswer: { // according to Objective 2,  this action type is 'Locate', and we can find the element, so we need to give a quick answer
+        reason: "It is located  near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
+        text: '中文',
+        ${quickAnswerFormat().sample}
       },
-      "quickAnswer": null,
     },
-    // ... more actions
+    {
+      thought: 'Click the language switch button to open the language options.',
+      type: 'Tap',
+      param: null,
+    },
+    {
+      thought: 'Wait for 1 second to ensure the language options are displayed.',
+      type: 'Sleep',
+      param: { timeMs: 1000 },
+    },
+    {
+      thought: "Locate the 'English' option in the language menu.",
+      type: 'Locate',
+      param: { prompt: "The 'English' option in the language menu" },
+      quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
+    },
+    {
+      thought: "Click the 'English' option to switch the language.",
+      type: 'Tap',
+      param: null,
+    }
   ],
-  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
 }
+\`\`\`
 `;
 }
 var planSchema = {
@@ -4775,7 +4818,7 @@ var planSchema = {
                 properties: {
                   reason: {
                     type: "string",
-                    description: "Reason for finding element 4"
+                    description: "Reason for finding this element"
                   },
                   text: {
                     type: "string",
@@ -4833,8 +4876,6 @@ skill content:
 Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
 Return in the following JSON format:
 {
   language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4882,8 +4923,7 @@ var assertSchema = {
 function describeSize(size) {
   return `${size.width} x ${size.height}`;
 }
-function truncateText(text) {
-  const maxLength = 50;
+function truncateText(text, maxLength = 20) {
   if (text && text.length > maxLength) {
     return `${text.slice(0, maxLength)}...`;
   }
@@ -4915,16 +4955,15 @@ async function describeUserPage(context) {
   const elementInfosDescription = cropFieldInformation(elementsInfo);
   return {
     description: `
-    {
-      // The size of the page
-      "pageSize": ${describeSize({ width, height })},
+{
+  // The size of the page
+  "pageSize": ${describeSize({ width, height })},
-      ${// if match by id, use the description of the element
-    !getAIConfig(MATCH_BY_POSITION) ? `
-          // json description of the element
-          "content": ${JSON.stringify(elementInfosDescription)}
-          ` : ""}
-    }`,
+  ${// if match by id, use the description of the element
+    getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
+  "content": ${JSON.stringify(elementInfosDescription)}
+      `}
+}`,
     elementById(id) {
       (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
       const item = idElementMap[`${id}`];
@@ -4943,7 +4982,13 @@ function cropFieldInformation(elementsInfo) {
       const tailorAttributes = Object.keys(attributes).reduce(
         (res, currentKey) => {
           const attributeVal = attributes[currentKey];
-          res[currentKey] = truncateText(attributeVal);
+          if (currentKey === "style" || currentKey === "src")
+            return res;
+          if (currentKey === "nodeType") {
+            res[currentKey] = attributeVal.replace(/\sNode$/, "");
+          } else {
+            res[currentKey] = truncateText(attributeVal);
+          }
           return res;
         },
         {}
@@ -4952,12 +4997,18 @@ function cropFieldInformation(elementsInfo) {
         id,
         markerId: item.indexId,
         attributes: tailorAttributes,
-        rect,
+        rect: {
+          left: rect.left,
+          top: rect.top,
+          width: rect.width,
+          height: rect.height
+          // remove 'zoom' if it exists
+        },
         content: tailorContent
       };
     }
   );
-  return JSON.stringify(elementInfosDescription);
+  return elementInfosDescription;
 }
 // src/ai-model/openai/index.ts

package/dist/lib/index.js CHANGED

@@ -4292,6 +4292,7 @@ __export(src_exports, {
   default: () => src_default,
   getAIConfig: () => getAIConfig,
   getElement: () => getElement,
+  getLogDirByType: () => getLogDirByType,
   getSection: () => getSection,
   getVersion: () => getVersion,
   overrideAIConfig: () => overrideAIConfig,
@@ -4524,7 +4525,7 @@ Input Example:
       },
       "elementInfos": [
         {
-          "id": "we23xsfwe", // ID of the element
+          "id": "1231", // ID of the element
           "indexId": "0", // Index of the element，The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4540,7 +4541,7 @@ Input Example:
           }
         },
         {
-          "id": "wefew2222few2", // ID of the element
+          "id": "66551", // ID of the element
           "indexId": "1", // Index of the element,The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4557,7 +4558,7 @@ Input Example:
         },
         ...
         {
-          "id": "kwekfj2323",
+          "id": "12344",
           "indexId": "2", // Index of the element，The image is labeled to the left of the element
           "attributes": {
             "nodeType": "TEXT Node",
@@ -4590,7 +4591,7 @@ Output Example:
       "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
       "text": "",
       // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
-      "id": "wefew2222few2"
+      "id": "1231"
     }
   ],
   "errors": []
@@ -4677,6 +4678,19 @@ var findElementSchema = {
 };
 // src/ai-model/prompt/planning.ts
+var quickAnswerFormat = () => {
+  const matchByPosition = getAIConfig(MATCH_BY_POSITION);
+  const description = `
+  ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
+  `;
+  const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
+  const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
+  return {
+    description,
+    format,
+    sample
+  };
+};
 function systemPromptToTaskPlanning() {
   return `
 ## Role:
@@ -4700,32 +4714,24 @@ Each action has a type and corresponding param. To be detailed:
 * type: 'KeyboardPress',  press a key
   * param: { value: string },  the value to input or the key to press. Use （Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta） to represent the key.
 * type: 'Scroll'
-  * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
+  * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
 * type: 'Error'
   * param: { message: string }, the error message
 * type: 'Sleep'
   * param: { timeMs: number }, wait for timeMs milliseconds
-Here is an example of how to decompose a task.
-When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
-* Locate: 'The search bar'
-* Input: 'Weather in Shanghai'
-* Sleep: 1000
-* KeyboardPress: 'Enter'
 Remember:
 1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
-2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
-If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
+## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
-## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
-If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
+If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
 {
-  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
+  "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
   "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-  ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+  ${quickAnswerFormat().description}
 }
 If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4738,33 +4744,71 @@ Please return the result in JSON format as follows:
   actions: [ // always return in Array
     {
       "thought": "find out the search bar",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
+      "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
+      "param": { //
         "prompt": "The search bar"
       },
-      "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
-        "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
-        "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-        ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+      "quickAnswer": {
+        "reason": "This is ...",
+        "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+        ${quickAnswerFormat().format}
       } | null,
     },
     {
       "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
-      "param": any, // Parameter towards the task type
+      "type": "Tap",
+      "param": null,
     },
+    // ... more actions
+  ],
+  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
+}
+## Here is an example of how to decompose a task
+When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
+* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
+* Think and look in detail and fill all the fields in the JSON format.
+\`\`\`json
+{
+  queryLanguage: 'English',
+  actions:[
     {
-      "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
-        "prompt": "The search bar"
+      thought: "Locate the language switch button with the text '中文'.",
+      type: 'Locate',
+      param: { prompt: "The language switch button with the text '中文'" },
+      quickAnswer: { // according to Objective 2,  this action type is 'Locate', and we can find the element, so we need to give a quick answer
+        reason: "It is located  near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
+        text: '中文',
+        ${quickAnswerFormat().sample}
       },
-      "quickAnswer": null,
     },
-    // ... more actions
+    {
+      thought: 'Click the language switch button to open the language options.',
+      type: 'Tap',
+      param: null,
+    },
+    {
+      thought: 'Wait for 1 second to ensure the language options are displayed.',
+      type: 'Sleep',
+      param: { timeMs: 1000 },
+    },
+    {
+      thought: "Locate the 'English' option in the language menu.",
+      type: 'Locate',
+      param: { prompt: "The 'English' option in the language menu" },
+      quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
+    },
+    {
+      thought: "Click the 'English' option to switch the language.",
+      type: 'Tap',
+      param: null,
+    }
   ],
-  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
 }
+\`\`\`
 `;
 }
 var planSchema = {
@@ -4802,7 +4846,7 @@ var planSchema = {
                 properties: {
                   reason: {
                     type: "string",
-                    description: "Reason for finding element 4"
+                    description: "Reason for finding this element"
                   },
                   text: {
                     type: "string",
@@ -4863,8 +4907,6 @@ skill content:
 Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
 Return in the following JSON format:
 {
   language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4912,8 +4954,7 @@ var assertSchema = {
 function describeSize(size) {
   return `${size.width} x ${size.height}`;
 }
-function truncateText(text) {
-  const maxLength = 50;
+function truncateText(text, maxLength = 20) {
   if (text && text.length > maxLength) {
     return `${text.slice(0, maxLength)}...`;
   }
@@ -4945,16 +4986,15 @@ async function describeUserPage(context) {
   const elementInfosDescription = cropFieldInformation(elementsInfo);
   return {
     description: `
-    {
-      // The size of the page
-      "pageSize": ${describeSize({ width, height })},
+{
+  // The size of the page
+  "pageSize": ${describeSize({ width, height })},
-      ${// if match by id, use the description of the element
-    !getAIConfig(MATCH_BY_POSITION) ? `
-          // json description of the element
-          "content": ${JSON.stringify(elementInfosDescription)}
-          ` : ""}
-    }`,
+  ${// if match by id, use the description of the element
+    getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
+  "content": ${JSON.stringify(elementInfosDescription)}
+      `}
+}`,
     elementById(id) {
       (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
       const item = idElementMap[`${id}`];
@@ -4973,7 +5013,13 @@ function cropFieldInformation(elementsInfo) {
       const tailorAttributes = Object.keys(attributes).reduce(
         (res, currentKey) => {
           const attributeVal = attributes[currentKey];
-          res[currentKey] = truncateText(attributeVal);
+          if (currentKey === "style" || currentKey === "src")
+            return res;
+          if (currentKey === "nodeType") {
+            res[currentKey] = attributeVal.replace(/\sNode$/, "");
+          } else {
+            res[currentKey] = truncateText(attributeVal);
+          }
           return res;
         },
         {}
@@ -4982,12 +5028,18 @@ function cropFieldInformation(elementsInfo) {
         id,
         markerId: item.indexId,
         attributes: tailorAttributes,
-        rect,
+        rect: {
+          left: rect.left,
+          top: rect.top,
+          width: rect.width,
+          height: rect.height
+          // remove 'zoom' if it exists
+        },
         content: tailorContent
       };
     }
   );
-  return JSON.stringify(elementInfosDescription);
+  return elementInfosDescription;
 }
 function retrieveElement(prompt, opt) {
   if (opt == null ? void 0 : opt.multi) {
@@ -5294,6 +5346,7 @@ function writeLogFile(opts) {
 # Midscene.js dump files
 ${logDirName}/report
 ${logDirName}/dump
+${logDirName}/tmp
 `,
         "utf-8"
       );
@@ -5325,7 +5378,7 @@ function stringifyDumpData(data, indents) {
   return JSON.stringify(data, replacerForPageObject, indents);
 }
 function getVersion() {
-  return "0.8.4";
+  return "0.8.5-beta-20241122072506.0";
 }
 // src/action/executor.ts
@@ -5348,9 +5401,12 @@ var Executor = class {
     };
   }
   async append(task) {
+    var _a, _b;
     (0, import_node_assert5.default)(
       this.status !== "error",
-      "executor is in error state, cannot append task"
+      `executor is in error state, cannot append task
+error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
+${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
     );
     if (Array.isArray(task)) {
       this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
@@ -6094,6 +6150,7 @@ var src_default = Insight;
   allAIConfig,
   getAIConfig,
   getElement,
+  getLogDirByType,
   getSection,
   getVersion,
   overrideAIConfig,

package/dist/lib/types/ai-model.d.ts CHANGED

@@ -1,6 +1,6 @@
-export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-690c2a06.js';
+export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
 export { ChatCompletionMessageParam } from 'openai/resources';
-import './types-29994b1b.js';
+import './types-0d8eeece.js';
 declare function systemPromptToFindElement(): string;

package/dist/lib/types/{index-690c2a06.d.ts → index-152b8346.d.ts} RENAMED

@@ -1,5 +1,5 @@
 import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ChatCompletionMessageParam } from 'openai/resources';
-import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-29994b1b.js';
+import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-0d8eeece.js';
 type AIArgs = [
     ChatCompletionSystemMessageParam,

package/dist/lib/types/index.d.ts CHANGED

@@ -1,8 +1,8 @@
-import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-29994b1b.js';
-export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-29994b1b.js';
-import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-690c2a06.js';
-export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-690c2a06.js';
-export { getVersion, setLogDir } from './utils.js';
+import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-0d8eeece.js';
+export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-0d8eeece.js';
+import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-152b8346.js';
+export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
+export { getLogDirByType, getVersion, setLogDir } from './utils.js';
 import 'openai/resources';
 declare class Executor {

package/dist/lib/types/{types-29994b1b.d.ts → types-0d8eeece.d.ts} RENAMED Viewed

@@ -9,7 +9,9 @@ interface Size {
     height: number;
     dpr?: number;
 }
-type Rect = Point & Size;
+type Rect = Point & Size & {
+    zoom?: number;
+};
 declare enum NodeType {
     CONTAINER = "CONTAINER Node",
     FORM_ITEM = "FORM_ITEM Node",