@midscene/core 0.8.2-beta-20241115094249.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4497,7 +4497,7 @@ Input Example:
4497
4497
  },
4498
4498
  "elementInfos": [
4499
4499
  {
4500
- "id": "1231", // ID of the element
4500
+ "id": "we23xsfwe", // ID of the element
4501
4501
  "indexId": "0", // Index of the element,The image is labeled to the left of the element
4502
4502
  "attributes": { // Attributes of the element
4503
4503
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4513,7 +4513,7 @@ Input Example:
4513
4513
  }
4514
4514
  },
4515
4515
  {
4516
- "id": "66551", // ID of the element
4516
+ "id": "wefew2222few2", // ID of the element
4517
4517
  "indexId": "1", // Index of the element,The image is labeled to the left of the element
4518
4518
  "attributes": { // Attributes of the element
4519
4519
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4530,7 +4530,7 @@ Input Example:
4530
4530
  },
4531
4531
  ...
4532
4532
  {
4533
- "id": "12344",
4533
+ "id": "kwekfj2323",
4534
4534
  "indexId": "2", // Index of the element,The image is labeled to the left of the element
4535
4535
  "attributes": {
4536
4536
  "nodeType": "TEXT Node",
@@ -4563,7 +4563,7 @@ Output Example:
4563
4563
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4564
4564
  "text": "",
4565
4565
  // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
4566
- "id": "1231"
4566
+ "id": "wefew2222few2"
4567
4567
  }
4568
4568
  ],
4569
4569
  "errors": []
@@ -4650,19 +4650,6 @@ var findElementSchema = {
4650
4650
  };
4651
4651
 
4652
4652
  // src/ai-model/prompt/planning.ts
4653
- var quickAnswerFormat = () => {
4654
- const matchByPosition = getAIConfig(MATCH_BY_POSITION);
4655
- const description = `
4656
- ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
4657
- `;
4658
- const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
4659
- const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
4660
- return {
4661
- description,
4662
- format,
4663
- sample
4664
- };
4665
- };
4666
4653
  function systemPromptToTaskPlanning() {
4667
4654
  return `
4668
4655
  ## Role:
@@ -4686,24 +4673,32 @@ Each action has a type and corresponding param. To be detailed:
4686
4673
  * type: 'KeyboardPress', press a key
4687
4674
  * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4688
4675
  * type: 'Scroll'
4689
- * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
4676
+ * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4690
4677
  * type: 'Error'
4691
4678
  * param: { message: string }, the error message
4692
4679
  * type: 'Sleep'
4693
4680
  * param: { timeMs: number }, wait for timeMs milliseconds
4694
4681
 
4682
+ Here is an example of how to decompose a task.
4683
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4684
+ * Locate: 'The search bar'
4685
+ * Input: 'Weather in Shanghai'
4686
+ * Sleep: 1000
4687
+ * KeyboardPress: 'Enter'
4688
+
4695
4689
  Remember:
4696
4690
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4697
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4698
- 3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
4691
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4699
4692
 
4700
- ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
4693
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4701
4694
 
4702
- If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4695
+ ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
4696
+
4697
+ If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4703
4698
  {
4704
- "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
4699
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4705
4700
  "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4706
- ${quickAnswerFormat().description}
4701
+ ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4707
4702
  }
4708
4703
 
4709
4704
  If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4716,71 +4711,33 @@ Please return the result in JSON format as follows:
4716
4711
  actions: [ // always return in Array
4717
4712
  {
4718
4713
  "thought": "find out the search bar",
4719
- "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
4720
- "param": { //
4714
+ "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4715
+ "param": {
4721
4716
  "prompt": "The search bar"
4722
4717
  },
4723
- "quickAnswer": {
4724
- "reason": "This is ...",
4725
- "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4726
- ${quickAnswerFormat().format}
4718
+ "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
4719
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4720
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4721
+ ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4727
4722
  } | null,
4728
4723
  },
4729
4724
  {
4730
4725
  "thought": "Reasons for generating this task, and why this task is feasible on this page",
4731
- "type": "Tap",
4732
- "param": null,
4726
+ "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4727
+ "param": any, // Parameter towards the task type
4733
4728
  },
4734
- // ... more actions
4735
- ],
4736
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4737
- }
4738
-
4739
- ## Here is an example of how to decompose a task
4740
-
4741
- When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
4742
-
4743
- * The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
4744
- * Think and look in detail and fill all the fields in the JSON format.
4745
-
4746
- \`\`\`json
4747
- {
4748
- queryLanguage: 'English',
4749
- actions:[
4750
4729
  {
4751
- thought: "Locate the language switch button with the text '中文'.",
4752
- type: 'Locate',
4753
- param: { prompt: "The language switch button with the text '中文'" },
4754
- quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
4755
- reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
4756
- text: '中文',
4757
- ${quickAnswerFormat().sample}
4730
+ "thought": "Reasons for generating this task, and why this task is feasible on this page",
4731
+ "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4732
+ "param": {
4733
+ "prompt": "The search bar"
4758
4734
  },
4735
+ "quickAnswer": null,
4759
4736
  },
4760
- {
4761
- thought: 'Click the language switch button to open the language options.',
4762
- type: 'Tap',
4763
- param: null,
4764
- },
4765
- {
4766
- thought: 'Wait for 1 second to ensure the language options are displayed.',
4767
- type: 'Sleep',
4768
- param: { timeMs: 1000 },
4769
- },
4770
- {
4771
- thought: "Locate the 'English' option in the language menu.",
4772
- type: 'Locate',
4773
- param: { prompt: "The 'English' option in the language menu" },
4774
- quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
4775
- },
4776
- {
4777
- thought: "Click the 'English' option to switch the language.",
4778
- type: 'Tap',
4779
- param: null,
4780
- }
4737
+ // ... more actions
4781
4738
  ],
4739
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4782
4740
  }
4783
- \`\`\`
4784
4741
  `;
4785
4742
  }
4786
4743
  var planSchema = {
@@ -4818,7 +4775,7 @@ var planSchema = {
4818
4775
  properties: {
4819
4776
  reason: {
4820
4777
  type: "string",
4821
- description: "Reason for finding this element"
4778
+ description: "Reason for finding element 4"
4822
4779
  },
4823
4780
  text: {
4824
4781
  type: "string",
@@ -4876,6 +4833,8 @@ skill content:
4876
4833
 
4877
4834
  Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
4878
4835
 
4836
+
4837
+
4879
4838
  Return in the following JSON format:
4880
4839
  {
4881
4840
  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4923,7 +4882,8 @@ var assertSchema = {
4923
4882
  function describeSize(size) {
4924
4883
  return `${size.width} x ${size.height}`;
4925
4884
  }
4926
- function truncateText(text, maxLength = 20) {
4885
+ function truncateText(text) {
4886
+ const maxLength = 50;
4927
4887
  if (text && text.length > maxLength) {
4928
4888
  return `${text.slice(0, maxLength)}...`;
4929
4889
  }
@@ -4955,15 +4915,16 @@ async function describeUserPage(context) {
4955
4915
  const elementInfosDescription = cropFieldInformation(elementsInfo);
4956
4916
  return {
4957
4917
  description: `
4958
- {
4959
- // The size of the page
4960
- "pageSize": ${describeSize({ width, height })},
4918
+ {
4919
+ // The size of the page
4920
+ "pageSize": ${describeSize({ width, height })},
4961
4921
 
4962
- ${// if match by id, use the description of the element
4963
- getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
4964
- "content": ${JSON.stringify(elementInfosDescription)}
4965
- `}
4966
- }`,
4922
+ ${// if match by id, use the description of the element
4923
+ !getAIConfig(MATCH_BY_POSITION) ? `
4924
+ // json description of the element
4925
+ "content": ${JSON.stringify(elementInfosDescription)}
4926
+ ` : ""}
4927
+ }`,
4967
4928
  elementById(id) {
4968
4929
  (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
4969
4930
  const item = idElementMap[`${id}`];
@@ -4982,13 +4943,7 @@ function cropFieldInformation(elementsInfo) {
4982
4943
  const tailorAttributes = Object.keys(attributes).reduce(
4983
4944
  (res, currentKey) => {
4984
4945
  const attributeVal = attributes[currentKey];
4985
- if (currentKey === "style" || currentKey === "src")
4986
- return res;
4987
- if (currentKey === "nodeType") {
4988
- res[currentKey] = attributeVal.replace(/\sNode$/, "");
4989
- } else {
4990
- res[currentKey] = truncateText(attributeVal);
4991
- }
4946
+ res[currentKey] = truncateText(attributeVal);
4992
4947
  return res;
4993
4948
  },
4994
4949
  {}
@@ -4997,18 +4952,12 @@ function cropFieldInformation(elementsInfo) {
4997
4952
  id,
4998
4953
  markerId: item.indexId,
4999
4954
  attributes: tailorAttributes,
5000
- rect: {
5001
- left: rect.left,
5002
- top: rect.top,
5003
- width: rect.width,
5004
- height: rect.height
5005
- // remove 'zoom' if it exists
5006
- },
4955
+ rect,
5007
4956
  content: tailorContent
5008
4957
  };
5009
4958
  }
5010
4959
  );
5011
- return elementInfosDescription;
4960
+ return JSON.stringify(elementInfosDescription);
5012
4961
  }
5013
4962
 
5014
4963
  // src/ai-model/openai/index.ts
@@ -5106,8 +5055,7 @@ async function call(messages, responseFormat) {
5106
5055
  messages,
5107
5056
  response_format: responseFormat,
5108
5057
  temperature: 0.1,
5109
- stream: false,
5110
- max_tokens: 1e3
5058
+ stream: false
5111
5059
  // betas: ['computer-use-2024-10-22'],
5112
5060
  });
5113
5061
  shouldPrintTiming && console.log(
package/dist/lib/index.js CHANGED
@@ -4524,7 +4524,7 @@ Input Example:
4524
4524
  },
4525
4525
  "elementInfos": [
4526
4526
  {
4527
- "id": "1231", // ID of the element
4527
+ "id": "we23xsfwe", // ID of the element
4528
4528
  "indexId": "0", // Index of the element,The image is labeled to the left of the element
4529
4529
  "attributes": { // Attributes of the element
4530
4530
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4540,7 +4540,7 @@ Input Example:
4540
4540
  }
4541
4541
  },
4542
4542
  {
4543
- "id": "66551", // ID of the element
4543
+ "id": "wefew2222few2", // ID of the element
4544
4544
  "indexId": "1", // Index of the element,The image is labeled to the left of the element
4545
4545
  "attributes": { // Attributes of the element
4546
4546
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4557,7 +4557,7 @@ Input Example:
4557
4557
  },
4558
4558
  ...
4559
4559
  {
4560
- "id": "12344",
4560
+ "id": "kwekfj2323",
4561
4561
  "indexId": "2", // Index of the element,The image is labeled to the left of the element
4562
4562
  "attributes": {
4563
4563
  "nodeType": "TEXT Node",
@@ -4590,7 +4590,7 @@ Output Example:
4590
4590
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4591
4591
  "text": "",
4592
4592
  // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
4593
- "id": "1231"
4593
+ "id": "wefew2222few2"
4594
4594
  }
4595
4595
  ],
4596
4596
  "errors": []
@@ -4677,19 +4677,6 @@ var findElementSchema = {
4677
4677
  };
4678
4678
 
4679
4679
  // src/ai-model/prompt/planning.ts
4680
- var quickAnswerFormat = () => {
4681
- const matchByPosition = getAIConfig(MATCH_BY_POSITION);
4682
- const description = `
4683
- ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
4684
- `;
4685
- const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
4686
- const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
4687
- return {
4688
- description,
4689
- format,
4690
- sample
4691
- };
4692
- };
4693
4680
  function systemPromptToTaskPlanning() {
4694
4681
  return `
4695
4682
  ## Role:
@@ -4713,24 +4700,32 @@ Each action has a type and corresponding param. To be detailed:
4713
4700
  * type: 'KeyboardPress', press a key
4714
4701
  * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4715
4702
  * type: 'Scroll'
4716
- * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
4703
+ * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4717
4704
  * type: 'Error'
4718
4705
  * param: { message: string }, the error message
4719
4706
  * type: 'Sleep'
4720
4707
  * param: { timeMs: number }, wait for timeMs milliseconds
4721
4708
 
4709
+ Here is an example of how to decompose a task.
4710
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4711
+ * Locate: 'The search bar'
4712
+ * Input: 'Weather in Shanghai'
4713
+ * Sleep: 1000
4714
+ * KeyboardPress: 'Enter'
4715
+
4722
4716
  Remember:
4723
4717
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4724
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4725
- 3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
4718
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4726
4719
 
4727
- ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
4720
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4728
4721
 
4729
- If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4722
+ ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
4723
+
4724
+ If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4730
4725
  {
4731
- "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
4726
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4732
4727
  "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4733
- ${quickAnswerFormat().description}
4728
+ ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4734
4729
  }
4735
4730
 
4736
4731
  If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4743,71 +4738,33 @@ Please return the result in JSON format as follows:
4743
4738
  actions: [ // always return in Array
4744
4739
  {
4745
4740
  "thought": "find out the search bar",
4746
- "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
4747
- "param": { //
4741
+ "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4742
+ "param": {
4748
4743
  "prompt": "The search bar"
4749
4744
  },
4750
- "quickAnswer": {
4751
- "reason": "This is ...",
4752
- "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4753
- ${quickAnswerFormat().format}
4745
+ "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
4746
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4747
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4748
+ ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
4754
4749
  } | null,
4755
4750
  },
4756
4751
  {
4757
4752
  "thought": "Reasons for generating this task, and why this task is feasible on this page",
4758
- "type": "Tap",
4759
- "param": null,
4753
+ "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4754
+ "param": any, // Parameter towards the task type
4760
4755
  },
4761
- // ... more actions
4762
- ],
4763
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4764
- }
4765
-
4766
- ## Here is an example of how to decompose a task
4767
-
4768
- When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
4769
-
4770
- * The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
4771
- * Think and look in detail and fill all the fields in the JSON format.
4772
-
4773
- \`\`\`json
4774
- {
4775
- queryLanguage: 'English',
4776
- actions:[
4777
4756
  {
4778
- thought: "Locate the language switch button with the text '中文'.",
4779
- type: 'Locate',
4780
- param: { prompt: "The language switch button with the text '中文'" },
4781
- quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
4782
- reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
4783
- text: '中文',
4784
- ${quickAnswerFormat().sample}
4757
+ "thought": "Reasons for generating this task, and why this task is feasible on this page",
4758
+ "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4759
+ "param": {
4760
+ "prompt": "The search bar"
4785
4761
  },
4762
+ "quickAnswer": null,
4786
4763
  },
4787
- {
4788
- thought: 'Click the language switch button to open the language options.',
4789
- type: 'Tap',
4790
- param: null,
4791
- },
4792
- {
4793
- thought: 'Wait for 1 second to ensure the language options are displayed.',
4794
- type: 'Sleep',
4795
- param: { timeMs: 1000 },
4796
- },
4797
- {
4798
- thought: "Locate the 'English' option in the language menu.",
4799
- type: 'Locate',
4800
- param: { prompt: "The 'English' option in the language menu" },
4801
- quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
4802
- },
4803
- {
4804
- thought: "Click the 'English' option to switch the language.",
4805
- type: 'Tap',
4806
- param: null,
4807
- }
4764
+ // ... more actions
4808
4765
  ],
4766
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4809
4767
  }
4810
- \`\`\`
4811
4768
  `;
4812
4769
  }
4813
4770
  var planSchema = {
@@ -4845,7 +4802,7 @@ var planSchema = {
4845
4802
  properties: {
4846
4803
  reason: {
4847
4804
  type: "string",
4848
- description: "Reason for finding this element"
4805
+ description: "Reason for finding element 4"
4849
4806
  },
4850
4807
  text: {
4851
4808
  type: "string",
@@ -4906,6 +4863,8 @@ skill content:
4906
4863
 
4907
4864
  Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
4908
4865
 
4866
+
4867
+
4909
4868
  Return in the following JSON format:
4910
4869
  {
4911
4870
  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
@@ -4953,7 +4912,8 @@ var assertSchema = {
4953
4912
  function describeSize(size) {
4954
4913
  return `${size.width} x ${size.height}`;
4955
4914
  }
4956
- function truncateText(text, maxLength = 20) {
4915
+ function truncateText(text) {
4916
+ const maxLength = 50;
4957
4917
  if (text && text.length > maxLength) {
4958
4918
  return `${text.slice(0, maxLength)}...`;
4959
4919
  }
@@ -4985,15 +4945,16 @@ async function describeUserPage(context) {
4985
4945
  const elementInfosDescription = cropFieldInformation(elementsInfo);
4986
4946
  return {
4987
4947
  description: `
4988
- {
4989
- // The size of the page
4990
- "pageSize": ${describeSize({ width, height })},
4948
+ {
4949
+ // The size of the page
4950
+ "pageSize": ${describeSize({ width, height })},
4991
4951
 
4992
- ${// if match by id, use the description of the element
4993
- getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
4994
- "content": ${JSON.stringify(elementInfosDescription)}
4995
- `}
4996
- }`,
4952
+ ${// if match by id, use the description of the element
4953
+ !getAIConfig(MATCH_BY_POSITION) ? `
4954
+ // json description of the element
4955
+ "content": ${JSON.stringify(elementInfosDescription)}
4956
+ ` : ""}
4957
+ }`,
4997
4958
  elementById(id) {
4998
4959
  (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
4999
4960
  const item = idElementMap[`${id}`];
@@ -5012,13 +4973,7 @@ function cropFieldInformation(elementsInfo) {
5012
4973
  const tailorAttributes = Object.keys(attributes).reduce(
5013
4974
  (res, currentKey) => {
5014
4975
  const attributeVal = attributes[currentKey];
5015
- if (currentKey === "style" || currentKey === "src")
5016
- return res;
5017
- if (currentKey === "nodeType") {
5018
- res[currentKey] = attributeVal.replace(/\sNode$/, "");
5019
- } else {
5020
- res[currentKey] = truncateText(attributeVal);
5021
- }
4976
+ res[currentKey] = truncateText(attributeVal);
5022
4977
  return res;
5023
4978
  },
5024
4979
  {}
@@ -5027,18 +4982,12 @@ function cropFieldInformation(elementsInfo) {
5027
4982
  id,
5028
4983
  markerId: item.indexId,
5029
4984
  attributes: tailorAttributes,
5030
- rect: {
5031
- left: rect.left,
5032
- top: rect.top,
5033
- width: rect.width,
5034
- height: rect.height
5035
- // remove 'zoom' if it exists
5036
- },
4985
+ rect,
5037
4986
  content: tailorContent
5038
4987
  };
5039
4988
  }
5040
4989
  );
5041
- return elementInfosDescription;
4990
+ return JSON.stringify(elementInfosDescription);
5042
4991
  }
5043
4992
  function retrieveElement(prompt, opt) {
5044
4993
  if (opt == null ? void 0 : opt.multi) {
@@ -5173,8 +5122,7 @@ async function call(messages, responseFormat) {
5173
5122
  messages,
5174
5123
  response_format: responseFormat,
5175
5124
  temperature: 0.1,
5176
- stream: false,
5177
- max_tokens: 1e3
5125
+ stream: false
5178
5126
  // betas: ['computer-use-2024-10-22'],
5179
5127
  });
5180
5128
  shouldPrintTiming && console.log(
@@ -5374,7 +5322,7 @@ function stringifyDumpData(data, indents) {
5374
5322
  return JSON.stringify(data, replacerForPageObject, indents);
5375
5323
  }
5376
5324
  function getVersion() {
5377
- return "0.8.2-beta-20241115094249.0";
5325
+ return "0.8.2";
5378
5326
  }
5379
5327
 
5380
5328
  // src/action/executor.ts
@@ -1,6 +1,6 @@
1
- export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-02a3ab02.js';
1
+ export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-691f031e.js';
2
2
  export { ChatCompletionMessageParam } from 'openai/resources';
3
- import './types-0d8eeece.js';
3
+ import './types-29994b1b.js';
4
4
 
5
5
  declare function systemPromptToFindElement(): string;
6
6
 
@@ -1,5 +1,5 @@
1
1
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ChatCompletionMessageParam } from 'openai/resources';
2
- import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-0d8eeece.js';
2
+ import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-29994b1b.js';
3
3
 
4
4
  type AIArgs = [
5
5
  ChatCompletionSystemMessageParam,
@@ -1,7 +1,7 @@
1
- import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-0d8eeece.js';
2
- export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-0d8eeece.js';
3
- import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-02a3ab02.js';
4
- export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-02a3ab02.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-29994b1b.js';
2
+ export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-29994b1b.js';
3
+ import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-691f031e.js';
4
+ export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-691f031e.js';
5
5
  export { getVersion, setLogDir } from './utils.js';
6
6
  import 'openai/resources';
7
7
 
@@ -9,9 +9,7 @@ interface Size {
9
9
  height: number;
10
10
  dpr?: number;
11
11
  }
12
- type Rect = Point & Size & {
13
- zoom?: number;
14
- };
12
+ type Rect = Point & Size;
15
13
  declare enum NodeType {
16
14
  CONTAINER = "CONTAINER Node",
17
15
  FORM_ITEM = "FORM_ITEM Node",
@@ -1,4 +1,4 @@
1
- import { r as ReportDumpWithAttributes, R as Rect } from './types-0d8eeece.js';
1
+ import { r as ReportDumpWithAttributes, R as Rect } from './types-29994b1b.js';
2
2
  import 'openai/resources';
3
3
 
4
4
  declare const insightDumpFileExt = "insight-dump.json";
package/dist/lib/utils.js CHANGED
@@ -213,7 +213,7 @@ function stringifyDumpData(data, indents) {
213
213
  return JSON.stringify(data, replacerForPageObject, indents);
214
214
  }
215
215
  function getVersion() {
216
- return "0.8.2-beta-20241115094249.0";
216
+ return "0.8.2";
217
217
  }
218
218
  // Annotate the CommonJS export names for ESM import in node:
219
219
  0 && (module.exports = {