@midscene/core 0.8.4 → 0.8.5-beta-20241122072506.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4497,7 +4497,7 @@ Input Example:
4497
4497
  },
4498
4498
  "elementInfos": [
4499
4499
  {
4500
- "id": "we23xsfwe", // ID of the element
4500
+ "id": "1231", // ID of the element
4501
4501
  "indexId": "0", // Index of the element,The image is labeled to the left of the element
4502
4502
  "attributes": { // Attributes of the element
4503
4503
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4513,7 +4513,7 @@ Input Example:
4513
4513
  }
4514
4514
  },
4515
4515
  {
4516
- "id": "wefew2222few2", // ID of the element
4516
+ "id": "66551", // ID of the element
4517
4517
  "indexId": "1", // Index of the element,The image is labeled to the left of the element
4518
4518
  "attributes": { // Attributes of the element
4519
4519
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4530,7 +4530,7 @@ Input Example:
4530
4530
  },
4531
4531
  ...
4532
4532
  {
4533
- "id": "kwekfj2323",
4533
+ "id": "12344",
4534
4534
  "indexId": "2", // Index of the element,The image is labeled to the left of the element
4535
4535
  "attributes": {
4536
4536
  "nodeType": "TEXT Node",
@@ -4563,7 +4563,7 @@ Output Example:
4563
4563
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4564
4564
  "text": "",
4565
4565
  // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
4566
- "id": "wefew2222few2"
4566
+ "id": "1231"
4567
4567
  }
4568
4568
  ],
4569
4569
  "errors": []
@@ -4650,6 +4650,19 @@ var findElementSchema = {
4650
4650
  };
4651
4651
 
4652
4652
  // src/ai-model/prompt/planning.ts
4653
+ var quickAnswerFormat = () => {
4654
+ const matchByPosition = getAIConfig(MATCH_BY_POSITION);
4655
+ const description = `
4656
+ ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
4657
+ `;
4658
+ const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
4659
+ const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
4660
+ return {
4661
+ description,
4662
+ format,
4663
+ sample
4664
+ };
4665
+ };
4653
4666
  function systemPromptToTaskPlanning() {
4654
4667
  return `
4655
4668
  ## Role:
@@ -4673,32 +4686,24 @@ Each action has a type and corresponding param. To be detailed:
4673
4686
  * type: 'KeyboardPress', press a key
4674
4687
  * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4675
4688
  * type: 'Scroll'
4676
- * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4689
+ * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
4677
4690
  * type: 'Error'
4678
4691
  * param: { message: string }, the error message
4679
4692
  * type: 'Sleep'
4680
4693
  * param: { timeMs: number }, wait for timeMs milliseconds
4681
4694
 
4682
- Here is an example of how to decompose a task.
4683
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4684
- * Locate: 'The search bar'
4685
- * Input: 'Weather in Shanghai'
4686
- * Sleep: 1000
4687
- * KeyboardPress: 'Enter'
4688
-
4689
4695
  Remember:
4690
4696
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4691
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4697
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4698
+ 3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
4692
4699
 
4693
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4700
+ ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
4694
4701
 
4695
- ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
4696
-
4697
- If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4702
+ If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4698
4703
  {
4699
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4704
+ "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
4700
4705
  "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4701
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4706
+ ${quickAnswerFormat().description}
4702
4707
  }
4703
4708
 
4704
4709
  If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4711,33 +4716,71 @@ Please return the result in JSON format as follows:
4711
4716
  actions: [ // always return in Array
4712
4717
  {
4713
4718
  "thought": "find out the search bar",
4714
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4715
- "param": {
4719
+ "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
4720
+ "param": { //
4716
4721
  "prompt": "The search bar"
4717
4722
  },
4718
- "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
4719
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4720
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4721
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4723
+ "quickAnswer": {
4724
+ "reason": "This is ...",
4725
+ "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4726
+ ${quickAnswerFormat().format}
4722
4727
  } | null,
4723
4728
  },
4724
4729
  {
4725
4730
  "thought": "Reasons for generating this task, and why this task is feasible on this page",
4726
- "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4727
- "param": any, // Parameter towards the task type
4731
+ "type": "Tap",
4732
+ "param": null,
4728
4733
  },
4734
+ // ... more actions
4735
+ ],
4736
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4737
+ }
4738
+
4739
+ ## Here is an example of how to decompose a task
4740
+
4741
+ When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
4742
+
4743
+ * The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
4744
+ * Think and look in detail and fill all the fields in the JSON format.
4745
+
4746
+ \`\`\`json
4747
+ {
4748
+ queryLanguage: 'English',
4749
+ actions:[
4729
4750
  {
4730
- "thought": "Reasons for generating this task, and why this task is feasible on this page",
4731
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4732
- "param": {
4733
- "prompt": "The search bar"
4751
+ thought: "Locate the language switch button with the text '中文'.",
4752
+ type: 'Locate',
4753
+ param: { prompt: "The language switch button with the text '中文'" },
4754
+ quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
4755
+ reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
4756
+ text: '中文',
4757
+ ${quickAnswerFormat().sample}
4734
4758
  },
4735
- "quickAnswer": null,
4736
4759
  },
4737
- // ... more actions
4760
+ {
4761
+ thought: 'Click the language switch button to open the language options.',
4762
+ type: 'Tap',
4763
+ param: null,
4764
+ },
4765
+ {
4766
+ thought: 'Wait for 1 second to ensure the language options are displayed.',
4767
+ type: 'Sleep',
4768
+ param: { timeMs: 1000 },
4769
+ },
4770
+ {
4771
+ thought: "Locate the 'English' option in the language menu.",
4772
+ type: 'Locate',
4773
+ param: { prompt: "The 'English' option in the language menu" },
4774
+ quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
4775
+ },
4776
+ {
4777
+ thought: "Click the 'English' option to switch the language.",
4778
+ type: 'Tap',
4779
+ param: null,
4780
+ }
4738
4781
  ],
4739
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4740
4782
  }
4783
+ \`\`\`
4741
4784
  `;
4742
4785
  }
4743
4786
  var planSchema = {
@@ -4775,7 +4818,7 @@ var planSchema = {
4775
4818
  properties: {
4776
4819
  reason: {
4777
4820
  type: "string",
4778
- description: "Reason for finding element 4"
4821
+ description: "Reason for finding this element"
4779
4822
  },
4780
4823
  text: {
4781
4824
  type: "string",
@@ -4833,8 +4876,6 @@ skill content:
4833
4876
 
4834
4877
  Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
4835
4878
 
4836
-
4837
-
4838
4879
  Return in the following JSON format:
4839
4880
  {
4840
4881
  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4882,8 +4923,7 @@ var assertSchema = {
4882
4923
  function describeSize(size) {
4883
4924
  return `${size.width} x ${size.height}`;
4884
4925
  }
4885
- function truncateText(text) {
4886
- const maxLength = 50;
4926
+ function truncateText(text, maxLength = 20) {
4887
4927
  if (text && text.length > maxLength) {
4888
4928
  return `${text.slice(0, maxLength)}...`;
4889
4929
  }
@@ -4915,16 +4955,15 @@ async function describeUserPage(context) {
4915
4955
  const elementInfosDescription = cropFieldInformation(elementsInfo);
4916
4956
  return {
4917
4957
  description: `
4918
- {
4919
- // The size of the page
4920
- "pageSize": ${describeSize({ width, height })},
4958
+ {
4959
+ // The size of the page
4960
+ "pageSize": ${describeSize({ width, height })},
4921
4961
 
4922
- ${// if match by id, use the description of the element
4923
- !getAIConfig(MATCH_BY_POSITION) ? `
4924
- // json description of the element
4925
- "content": ${JSON.stringify(elementInfosDescription)}
4926
- ` : ""}
4927
- }`,
4962
+ ${// if match by id, use the description of the element
4963
+ getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
4964
+ "content": ${JSON.stringify(elementInfosDescription)}
4965
+ `}
4966
+ }`,
4928
4967
  elementById(id) {
4929
4968
  (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
4930
4969
  const item = idElementMap[`${id}`];
@@ -4943,7 +4982,13 @@ function cropFieldInformation(elementsInfo) {
4943
4982
  const tailorAttributes = Object.keys(attributes).reduce(
4944
4983
  (res, currentKey) => {
4945
4984
  const attributeVal = attributes[currentKey];
4946
- res[currentKey] = truncateText(attributeVal);
4985
+ if (currentKey === "style" || currentKey === "src")
4986
+ return res;
4987
+ if (currentKey === "nodeType") {
4988
+ res[currentKey] = attributeVal.replace(/\sNode$/, "");
4989
+ } else {
4990
+ res[currentKey] = truncateText(attributeVal);
4991
+ }
4947
4992
  return res;
4948
4993
  },
4949
4994
  {}
@@ -4952,12 +4997,18 @@ function cropFieldInformation(elementsInfo) {
4952
4997
  id,
4953
4998
  markerId: item.indexId,
4954
4999
  attributes: tailorAttributes,
4955
- rect,
5000
+ rect: {
5001
+ left: rect.left,
5002
+ top: rect.top,
5003
+ width: rect.width,
5004
+ height: rect.height
5005
+ // remove 'zoom' if it exists
5006
+ },
4956
5007
  content: tailorContent
4957
5008
  };
4958
5009
  }
4959
5010
  );
4960
- return JSON.stringify(elementInfosDescription);
5011
+ return elementInfosDescription;
4961
5012
  }
4962
5013
 
4963
5014
  // src/ai-model/openai/index.ts
package/dist/lib/index.js CHANGED
@@ -4292,6 +4292,7 @@ __export(src_exports, {
4292
4292
  default: () => src_default,
4293
4293
  getAIConfig: () => getAIConfig,
4294
4294
  getElement: () => getElement,
4295
+ getLogDirByType: () => getLogDirByType,
4295
4296
  getSection: () => getSection,
4296
4297
  getVersion: () => getVersion,
4297
4298
  overrideAIConfig: () => overrideAIConfig,
@@ -4524,7 +4525,7 @@ Input Example:
4524
4525
  },
4525
4526
  "elementInfos": [
4526
4527
  {
4527
- "id": "we23xsfwe", // ID of the element
4528
+ "id": "1231", // ID of the element
4528
4529
  "indexId": "0", // Index of the element,The image is labeled to the left of the element
4529
4530
  "attributes": { // Attributes of the element
4530
4531
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4540,7 +4541,7 @@ Input Example:
4540
4541
  }
4541
4542
  },
4542
4543
  {
4543
- "id": "wefew2222few2", // ID of the element
4544
+ "id": "66551", // ID of the element
4544
4545
  "indexId": "1", // Index of the element,The image is labeled to the left of the element
4545
4546
  "attributes": { // Attributes of the element
4546
4547
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4557,7 +4558,7 @@ Input Example:
4557
4558
  },
4558
4559
  ...
4559
4560
  {
4560
- "id": "kwekfj2323",
4561
+ "id": "12344",
4561
4562
  "indexId": "2", // Index of the element,The image is labeled to the left of the element
4562
4563
  "attributes": {
4563
4564
  "nodeType": "TEXT Node",
@@ -4590,7 +4591,7 @@ Output Example:
4590
4591
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4591
4592
  "text": "",
4592
4593
  // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
4593
- "id": "wefew2222few2"
4594
+ "id": "1231"
4594
4595
  }
4595
4596
  ],
4596
4597
  "errors": []
@@ -4677,6 +4678,19 @@ var findElementSchema = {
4677
4678
  };
4678
4679
 
4679
4680
  // src/ai-model/prompt/planning.ts
4681
+ var quickAnswerFormat = () => {
4682
+ const matchByPosition = getAIConfig(MATCH_BY_POSITION);
4683
+ const description = `
4684
+ ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
4685
+ `;
4686
+ const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
4687
+ const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
4688
+ return {
4689
+ description,
4690
+ format,
4691
+ sample
4692
+ };
4693
+ };
4680
4694
  function systemPromptToTaskPlanning() {
4681
4695
  return `
4682
4696
  ## Role:
@@ -4700,32 +4714,24 @@ Each action has a type and corresponding param. To be detailed:
4700
4714
  * type: 'KeyboardPress', press a key
4701
4715
  * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4702
4716
  * type: 'Scroll'
4703
- * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4717
+ * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
4704
4718
  * type: 'Error'
4705
4719
  * param: { message: string }, the error message
4706
4720
  * type: 'Sleep'
4707
4721
  * param: { timeMs: number }, wait for timeMs milliseconds
4708
4722
 
4709
- Here is an example of how to decompose a task.
4710
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4711
- * Locate: 'The search bar'
4712
- * Input: 'Weather in Shanghai'
4713
- * Sleep: 1000
4714
- * KeyboardPress: 'Enter'
4715
-
4716
4723
  Remember:
4717
4724
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4718
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4725
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4726
+ 3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
4719
4727
 
4720
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4728
+ ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
4721
4729
 
4722
- ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
4723
-
4724
- If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4730
+ If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4725
4731
  {
4726
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4732
+ "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
4727
4733
  "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4728
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4734
+ ${quickAnswerFormat().description}
4729
4735
  }
4730
4736
 
4731
4737
  If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4738,33 +4744,71 @@ Please return the result in JSON format as follows:
4738
4744
  actions: [ // always return in Array
4739
4745
  {
4740
4746
  "thought": "find out the search bar",
4741
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4742
- "param": {
4747
+ "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
4748
+ "param": { //
4743
4749
  "prompt": "The search bar"
4744
4750
  },
4745
- "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
4746
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4747
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4748
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4751
+ "quickAnswer": {
4752
+ "reason": "This is ...",
4753
+ "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4754
+ ${quickAnswerFormat().format}
4749
4755
  } | null,
4750
4756
  },
4751
4757
  {
4752
4758
  "thought": "Reasons for generating this task, and why this task is feasible on this page",
4753
- "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4754
- "param": any, // Parameter towards the task type
4759
+ "type": "Tap",
4760
+ "param": null,
4755
4761
  },
4762
+ // ... more actions
4763
+ ],
4764
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4765
+ }
4766
+
4767
+ ## Here is an example of how to decompose a task
4768
+
4769
+ When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
4770
+
4771
+ * The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
4772
+ * Think and look in detail and fill all the fields in the JSON format.
4773
+
4774
+ \`\`\`json
4775
+ {
4776
+ queryLanguage: 'English',
4777
+ actions:[
4756
4778
  {
4757
- "thought": "Reasons for generating this task, and why this task is feasible on this page",
4758
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4759
- "param": {
4760
- "prompt": "The search bar"
4779
+ thought: "Locate the language switch button with the text '中文'.",
4780
+ type: 'Locate',
4781
+ param: { prompt: "The language switch button with the text '中文'" },
4782
+ quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
4783
+ reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
4784
+ text: '中文',
4785
+ ${quickAnswerFormat().sample}
4761
4786
  },
4762
- "quickAnswer": null,
4763
4787
  },
4764
- // ... more actions
4788
+ {
4789
+ thought: 'Click the language switch button to open the language options.',
4790
+ type: 'Tap',
4791
+ param: null,
4792
+ },
4793
+ {
4794
+ thought: 'Wait for 1 second to ensure the language options are displayed.',
4795
+ type: 'Sleep',
4796
+ param: { timeMs: 1000 },
4797
+ },
4798
+ {
4799
+ thought: "Locate the 'English' option in the language menu.",
4800
+ type: 'Locate',
4801
+ param: { prompt: "The 'English' option in the language menu" },
4802
+ quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
4803
+ },
4804
+ {
4805
+ thought: "Click the 'English' option to switch the language.",
4806
+ type: 'Tap',
4807
+ param: null,
4808
+ }
4765
4809
  ],
4766
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4767
4810
  }
4811
+ \`\`\`
4768
4812
  `;
4769
4813
  }
4770
4814
  var planSchema = {
@@ -4802,7 +4846,7 @@ var planSchema = {
4802
4846
  properties: {
4803
4847
  reason: {
4804
4848
  type: "string",
4805
- description: "Reason for finding element 4"
4849
+ description: "Reason for finding this element"
4806
4850
  },
4807
4851
  text: {
4808
4852
  type: "string",
@@ -4863,8 +4907,6 @@ skill content:
4863
4907
 
4864
4908
  Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
4865
4909
 
4866
-
4867
-
4868
4910
  Return in the following JSON format:
4869
4911
  {
4870
4912
  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4912,8 +4954,7 @@ var assertSchema = {
4912
4954
  function describeSize(size) {
4913
4955
  return `${size.width} x ${size.height}`;
4914
4956
  }
4915
- function truncateText(text) {
4916
- const maxLength = 50;
4957
+ function truncateText(text, maxLength = 20) {
4917
4958
  if (text && text.length > maxLength) {
4918
4959
  return `${text.slice(0, maxLength)}...`;
4919
4960
  }
@@ -4945,16 +4986,15 @@ async function describeUserPage(context) {
4945
4986
  const elementInfosDescription = cropFieldInformation(elementsInfo);
4946
4987
  return {
4947
4988
  description: `
4948
- {
4949
- // The size of the page
4950
- "pageSize": ${describeSize({ width, height })},
4989
+ {
4990
+ // The size of the page
4991
+ "pageSize": ${describeSize({ width, height })},
4951
4992
 
4952
- ${// if match by id, use the description of the element
4953
- !getAIConfig(MATCH_BY_POSITION) ? `
4954
- // json description of the element
4955
- "content": ${JSON.stringify(elementInfosDescription)}
4956
- ` : ""}
4957
- }`,
4993
+ ${// if match by id, use the description of the element
4994
+ getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
4995
+ "content": ${JSON.stringify(elementInfosDescription)}
4996
+ `}
4997
+ }`,
4958
4998
  elementById(id) {
4959
4999
  (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
4960
5000
  const item = idElementMap[`${id}`];
@@ -4973,7 +5013,13 @@ function cropFieldInformation(elementsInfo) {
4973
5013
  const tailorAttributes = Object.keys(attributes).reduce(
4974
5014
  (res, currentKey) => {
4975
5015
  const attributeVal = attributes[currentKey];
4976
- res[currentKey] = truncateText(attributeVal);
5016
+ if (currentKey === "style" || currentKey === "src")
5017
+ return res;
5018
+ if (currentKey === "nodeType") {
5019
+ res[currentKey] = attributeVal.replace(/\sNode$/, "");
5020
+ } else {
5021
+ res[currentKey] = truncateText(attributeVal);
5022
+ }
4977
5023
  return res;
4978
5024
  },
4979
5025
  {}
@@ -4982,12 +5028,18 @@ function cropFieldInformation(elementsInfo) {
4982
5028
  id,
4983
5029
  markerId: item.indexId,
4984
5030
  attributes: tailorAttributes,
4985
- rect,
5031
+ rect: {
5032
+ left: rect.left,
5033
+ top: rect.top,
5034
+ width: rect.width,
5035
+ height: rect.height
5036
+ // remove 'zoom' if it exists
5037
+ },
4986
5038
  content: tailorContent
4987
5039
  };
4988
5040
  }
4989
5041
  );
4990
- return JSON.stringify(elementInfosDescription);
5042
+ return elementInfosDescription;
4991
5043
  }
4992
5044
  function retrieveElement(prompt, opt) {
4993
5045
  if (opt == null ? void 0 : opt.multi) {
@@ -5294,6 +5346,7 @@ function writeLogFile(opts) {
5294
5346
  # Midscene.js dump files
5295
5347
  ${logDirName}/report
5296
5348
  ${logDirName}/dump
5349
+ ${logDirName}/tmp
5297
5350
  `,
5298
5351
  "utf-8"
5299
5352
  );
@@ -5325,7 +5378,7 @@ function stringifyDumpData(data, indents) {
5325
5378
  return JSON.stringify(data, replacerForPageObject, indents);
5326
5379
  }
5327
5380
  function getVersion() {
5328
- return "0.8.4";
5381
+ return "0.8.5-beta-20241122072506.0";
5329
5382
  }
5330
5383
 
5331
5384
  // src/action/executor.ts
@@ -5348,9 +5401,12 @@ var Executor = class {
5348
5401
  };
5349
5402
  }
5350
5403
  async append(task) {
5404
+ var _a, _b;
5351
5405
  (0, import_node_assert5.default)(
5352
5406
  this.status !== "error",
5353
- "executor is in error state, cannot append task"
5407
+ `executor is in error state, cannot append task
5408
+ error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
5409
+ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
5354
5410
  );
5355
5411
  if (Array.isArray(task)) {
5356
5412
  this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
@@ -6094,6 +6150,7 @@ var src_default = Insight;
6094
6150
  allAIConfig,
6095
6151
  getAIConfig,
6096
6152
  getElement,
6153
+ getLogDirByType,
6097
6154
  getSection,
6098
6155
  getVersion,
6099
6156
  overrideAIConfig,
@@ -1,6 +1,6 @@
1
- export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-690c2a06.js';
1
+ export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
2
2
  export { ChatCompletionMessageParam } from 'openai/resources';
3
- import './types-29994b1b.js';
3
+ import './types-0d8eeece.js';
4
4
 
5
5
  declare function systemPromptToFindElement(): string;
6
6
 
@@ -1,5 +1,5 @@
1
1
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ChatCompletionMessageParam } from 'openai/resources';
2
- import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-29994b1b.js';
2
+ import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-0d8eeece.js';
3
3
 
4
4
  type AIArgs = [
5
5
  ChatCompletionSystemMessageParam,
@@ -1,8 +1,8 @@
1
- import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-29994b1b.js';
2
- export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-29994b1b.js';
3
- import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-690c2a06.js';
4
- export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-690c2a06.js';
5
- export { getVersion, setLogDir } from './utils.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-0d8eeece.js';
2
+ export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-0d8eeece.js';
3
+ import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-152b8346.js';
4
+ export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
5
+ export { getLogDirByType, getVersion, setLogDir } from './utils.js';
6
6
  import 'openai/resources';
7
7
 
8
8
  declare class Executor {
@@ -9,7 +9,9 @@ interface Size {
9
9
  height: number;
10
10
  dpr?: number;
11
11
  }
12
- type Rect = Point & Size;
12
+ type Rect = Point & Size & {
13
+ zoom?: number;
14
+ };
13
15
  declare enum NodeType {
14
16
  CONTAINER = "CONTAINER Node",
15
17
  FORM_ITEM = "FORM_ITEM Node",