@midscene/core 0.8.2-beta-20241115094249.0 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +52 -104
- package/dist/lib/index.js +53 -105
- package/dist/lib/types/ai-model.d.ts +2 -2
- package/dist/lib/types/{index-02a3ab02.d.ts → index-691f031e.d.ts} +1 -1
- package/dist/lib/types/index.d.ts +4 -4
- package/dist/lib/types/{types-0d8eeece.d.ts → types-29994b1b.d.ts} +1 -3
- package/dist/lib/types/utils.d.ts +1 -1
- package/dist/lib/utils.js +1 -1
- package/package.json +2 -2
- package/report/index.html +3 -4
package/dist/lib/ai-model.js
CHANGED
|
@@ -4497,7 +4497,7 @@ Input Example:
|
|
|
4497
4497
|
},
|
|
4498
4498
|
"elementInfos": [
|
|
4499
4499
|
{
|
|
4500
|
-
"id": "
|
|
4500
|
+
"id": "we23xsfwe", // ID of the element
|
|
4501
4501
|
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4502
4502
|
"attributes": { // Attributes of the element
|
|
4503
4503
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4513,7 +4513,7 @@ Input Example:
|
|
|
4513
4513
|
}
|
|
4514
4514
|
},
|
|
4515
4515
|
{
|
|
4516
|
-
"id": "
|
|
4516
|
+
"id": "wefew2222few2", // ID of the element
|
|
4517
4517
|
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4518
4518
|
"attributes": { // Attributes of the element
|
|
4519
4519
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4530,7 +4530,7 @@ Input Example:
|
|
|
4530
4530
|
},
|
|
4531
4531
|
...
|
|
4532
4532
|
{
|
|
4533
|
-
"id": "
|
|
4533
|
+
"id": "kwekfj2323",
|
|
4534
4534
|
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4535
4535
|
"attributes": {
|
|
4536
4536
|
"nodeType": "TEXT Node",
|
|
@@ -4563,7 +4563,7 @@ Output Example:
|
|
|
4563
4563
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4564
4564
|
"text": "",
|
|
4565
4565
|
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
4566
|
-
"id": "
|
|
4566
|
+
"id": "wefew2222few2"
|
|
4567
4567
|
}
|
|
4568
4568
|
],
|
|
4569
4569
|
"errors": []
|
|
@@ -4650,19 +4650,6 @@ var findElementSchema = {
|
|
|
4650
4650
|
};
|
|
4651
4651
|
|
|
4652
4652
|
// src/ai-model/prompt/planning.ts
|
|
4653
|
-
var quickAnswerFormat = () => {
|
|
4654
|
-
const matchByPosition = getAIConfig(MATCH_BY_POSITION);
|
|
4655
|
-
const description = `
|
|
4656
|
-
${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
|
|
4657
|
-
`;
|
|
4658
|
-
const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
|
|
4659
|
-
const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
|
|
4660
|
-
return {
|
|
4661
|
-
description,
|
|
4662
|
-
format,
|
|
4663
|
-
sample
|
|
4664
|
-
};
|
|
4665
|
-
};
|
|
4666
4653
|
function systemPromptToTaskPlanning() {
|
|
4667
4654
|
return `
|
|
4668
4655
|
## Role:
|
|
@@ -4686,24 +4673,32 @@ Each action has a type and corresponding param. To be detailed:
|
|
|
4686
4673
|
* type: 'KeyboardPress', press a key
|
|
4687
4674
|
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4688
4675
|
* type: 'Scroll'
|
|
4689
|
-
* param: { scrollType: 'scrollDownOneScreen'
|
|
4676
|
+
* param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
|
|
4690
4677
|
* type: 'Error'
|
|
4691
4678
|
* param: { message: string }, the error message
|
|
4692
4679
|
* type: 'Sleep'
|
|
4693
4680
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4694
4681
|
|
|
4682
|
+
Here is an example of how to decompose a task.
|
|
4683
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4684
|
+
* Locate: 'The search bar'
|
|
4685
|
+
* Input: 'Weather in Shanghai'
|
|
4686
|
+
* Sleep: 1000
|
|
4687
|
+
* KeyboardPress: 'Enter'
|
|
4688
|
+
|
|
4695
4689
|
Remember:
|
|
4696
4690
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4697
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example,
|
|
4698
|
-
3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
|
|
4691
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4699
4692
|
|
|
4700
|
-
|
|
4693
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4701
4694
|
|
|
4702
|
-
|
|
4695
|
+
## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
|
|
4696
|
+
|
|
4697
|
+
If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4703
4698
|
{
|
|
4704
|
-
"reason": "It is located
|
|
4699
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4705
4700
|
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4706
|
-
${
|
|
4701
|
+
${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
|
|
4707
4702
|
}
|
|
4708
4703
|
|
|
4709
4704
|
If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
|
|
@@ -4716,71 +4711,33 @@ Please return the result in JSON format as follows:
|
|
|
4716
4711
|
actions: [ // always return in Array
|
|
4717
4712
|
{
|
|
4718
4713
|
"thought": "find out the search bar",
|
|
4719
|
-
"type": "Locate", //
|
|
4720
|
-
"param": {
|
|
4714
|
+
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4715
|
+
"param": {
|
|
4721
4716
|
"prompt": "The search bar"
|
|
4722
4717
|
},
|
|
4723
|
-
"quickAnswer": {
|
|
4724
|
-
"reason": "
|
|
4725
|
-
"text":
|
|
4726
|
-
${
|
|
4718
|
+
"quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
|
|
4719
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
|
|
4720
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4721
|
+
${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
|
|
4727
4722
|
} | null,
|
|
4728
4723
|
},
|
|
4729
4724
|
{
|
|
4730
4725
|
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4731
|
-
"type": "Tap",
|
|
4732
|
-
"param":
|
|
4726
|
+
"type": "Tap", // Type of action, like 'Tap' 'Hover' ...
|
|
4727
|
+
"param": any, // Parameter towards the task type
|
|
4733
4728
|
},
|
|
4734
|
-
// ... more actions
|
|
4735
|
-
],
|
|
4736
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4737
|
-
}
|
|
4738
|
-
|
|
4739
|
-
## Here is an example of how to decompose a task
|
|
4740
|
-
|
|
4741
|
-
When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
|
|
4742
|
-
|
|
4743
|
-
* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
|
|
4744
|
-
* Think and look in detail and fill all the fields in the JSON format.
|
|
4745
|
-
|
|
4746
|
-
\`\`\`json
|
|
4747
|
-
{
|
|
4748
|
-
queryLanguage: 'English',
|
|
4749
|
-
actions:[
|
|
4750
4729
|
{
|
|
4751
|
-
thought: "
|
|
4752
|
-
type:
|
|
4753
|
-
param: {
|
|
4754
|
-
|
|
4755
|
-
reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
|
|
4756
|
-
text: '中文',
|
|
4757
|
-
${quickAnswerFormat().sample}
|
|
4730
|
+
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4731
|
+
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4732
|
+
"param": {
|
|
4733
|
+
"prompt": "The search bar"
|
|
4758
4734
|
},
|
|
4735
|
+
"quickAnswer": null,
|
|
4759
4736
|
},
|
|
4760
|
-
|
|
4761
|
-
thought: 'Click the language switch button to open the language options.',
|
|
4762
|
-
type: 'Tap',
|
|
4763
|
-
param: null,
|
|
4764
|
-
},
|
|
4765
|
-
{
|
|
4766
|
-
thought: 'Wait for 1 second to ensure the language options are displayed.',
|
|
4767
|
-
type: 'Sleep',
|
|
4768
|
-
param: { timeMs: 1000 },
|
|
4769
|
-
},
|
|
4770
|
-
{
|
|
4771
|
-
thought: "Locate the 'English' option in the language menu.",
|
|
4772
|
-
type: 'Locate',
|
|
4773
|
-
param: { prompt: "The 'English' option in the language menu" },
|
|
4774
|
-
quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
|
|
4775
|
-
},
|
|
4776
|
-
{
|
|
4777
|
-
thought: "Click the 'English' option to switch the language.",
|
|
4778
|
-
type: 'Tap',
|
|
4779
|
-
param: null,
|
|
4780
|
-
}
|
|
4737
|
+
// ... more actions
|
|
4781
4738
|
],
|
|
4739
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4782
4740
|
}
|
|
4783
|
-
\`\`\`
|
|
4784
4741
|
`;
|
|
4785
4742
|
}
|
|
4786
4743
|
var planSchema = {
|
|
@@ -4818,7 +4775,7 @@ var planSchema = {
|
|
|
4818
4775
|
properties: {
|
|
4819
4776
|
reason: {
|
|
4820
4777
|
type: "string",
|
|
4821
|
-
description: "Reason for finding
|
|
4778
|
+
description: "Reason for finding element 4"
|
|
4822
4779
|
},
|
|
4823
4780
|
text: {
|
|
4824
4781
|
type: "string",
|
|
@@ -4876,6 +4833,8 @@ skill content:
|
|
|
4876
4833
|
|
|
4877
4834
|
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
4878
4835
|
|
|
4836
|
+
|
|
4837
|
+
|
|
4879
4838
|
Return in the following JSON format:
|
|
4880
4839
|
{
|
|
4881
4840
|
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
@@ -4923,7 +4882,8 @@ var assertSchema = {
|
|
|
4923
4882
|
function describeSize(size) {
|
|
4924
4883
|
return `${size.width} x ${size.height}`;
|
|
4925
4884
|
}
|
|
4926
|
-
function truncateText(text
|
|
4885
|
+
function truncateText(text) {
|
|
4886
|
+
const maxLength = 50;
|
|
4927
4887
|
if (text && text.length > maxLength) {
|
|
4928
4888
|
return `${text.slice(0, maxLength)}...`;
|
|
4929
4889
|
}
|
|
@@ -4955,15 +4915,16 @@ async function describeUserPage(context) {
|
|
|
4955
4915
|
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4956
4916
|
return {
|
|
4957
4917
|
description: `
|
|
4958
|
-
{
|
|
4959
|
-
|
|
4960
|
-
|
|
4918
|
+
{
|
|
4919
|
+
// The size of the page
|
|
4920
|
+
"pageSize": ${describeSize({ width, height })},
|
|
4961
4921
|
|
|
4962
|
-
|
|
4963
|
-
getAIConfig(MATCH_BY_POSITION) ?
|
|
4964
|
-
|
|
4965
|
-
|
|
4966
|
-
}
|
|
4922
|
+
${// if match by id, use the description of the element
|
|
4923
|
+
!getAIConfig(MATCH_BY_POSITION) ? `
|
|
4924
|
+
// json description of the element
|
|
4925
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4926
|
+
` : ""}
|
|
4927
|
+
}`,
|
|
4967
4928
|
elementById(id) {
|
|
4968
4929
|
(0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
|
|
4969
4930
|
const item = idElementMap[`${id}`];
|
|
@@ -4982,13 +4943,7 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4982
4943
|
const tailorAttributes = Object.keys(attributes).reduce(
|
|
4983
4944
|
(res, currentKey) => {
|
|
4984
4945
|
const attributeVal = attributes[currentKey];
|
|
4985
|
-
|
|
4986
|
-
return res;
|
|
4987
|
-
if (currentKey === "nodeType") {
|
|
4988
|
-
res[currentKey] = attributeVal.replace(/\sNode$/, "");
|
|
4989
|
-
} else {
|
|
4990
|
-
res[currentKey] = truncateText(attributeVal);
|
|
4991
|
-
}
|
|
4946
|
+
res[currentKey] = truncateText(attributeVal);
|
|
4992
4947
|
return res;
|
|
4993
4948
|
},
|
|
4994
4949
|
{}
|
|
@@ -4997,18 +4952,12 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4997
4952
|
id,
|
|
4998
4953
|
markerId: item.indexId,
|
|
4999
4954
|
attributes: tailorAttributes,
|
|
5000
|
-
rect
|
|
5001
|
-
left: rect.left,
|
|
5002
|
-
top: rect.top,
|
|
5003
|
-
width: rect.width,
|
|
5004
|
-
height: rect.height
|
|
5005
|
-
// remove 'zoom' if it exists
|
|
5006
|
-
},
|
|
4955
|
+
rect,
|
|
5007
4956
|
content: tailorContent
|
|
5008
4957
|
};
|
|
5009
4958
|
}
|
|
5010
4959
|
);
|
|
5011
|
-
return elementInfosDescription;
|
|
4960
|
+
return JSON.stringify(elementInfosDescription);
|
|
5012
4961
|
}
|
|
5013
4962
|
|
|
5014
4963
|
// src/ai-model/openai/index.ts
|
|
@@ -5106,8 +5055,7 @@ async function call(messages, responseFormat) {
|
|
|
5106
5055
|
messages,
|
|
5107
5056
|
response_format: responseFormat,
|
|
5108
5057
|
temperature: 0.1,
|
|
5109
|
-
stream: false
|
|
5110
|
-
max_tokens: 1e3
|
|
5058
|
+
stream: false
|
|
5111
5059
|
// betas: ['computer-use-2024-10-22'],
|
|
5112
5060
|
});
|
|
5113
5061
|
shouldPrintTiming && console.log(
|
package/dist/lib/index.js
CHANGED
|
@@ -4524,7 +4524,7 @@ Input Example:
|
|
|
4524
4524
|
},
|
|
4525
4525
|
"elementInfos": [
|
|
4526
4526
|
{
|
|
4527
|
-
"id": "
|
|
4527
|
+
"id": "we23xsfwe", // ID of the element
|
|
4528
4528
|
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4529
4529
|
"attributes": { // Attributes of the element
|
|
4530
4530
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4540,7 +4540,7 @@ Input Example:
|
|
|
4540
4540
|
}
|
|
4541
4541
|
},
|
|
4542
4542
|
{
|
|
4543
|
-
"id": "
|
|
4543
|
+
"id": "wefew2222few2", // ID of the element
|
|
4544
4544
|
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4545
4545
|
"attributes": { // Attributes of the element
|
|
4546
4546
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4557,7 +4557,7 @@ Input Example:
|
|
|
4557
4557
|
},
|
|
4558
4558
|
...
|
|
4559
4559
|
{
|
|
4560
|
-
"id": "
|
|
4560
|
+
"id": "kwekfj2323",
|
|
4561
4561
|
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4562
4562
|
"attributes": {
|
|
4563
4563
|
"nodeType": "TEXT Node",
|
|
@@ -4590,7 +4590,7 @@ Output Example:
|
|
|
4590
4590
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4591
4591
|
"text": "",
|
|
4592
4592
|
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
4593
|
-
"id": "
|
|
4593
|
+
"id": "wefew2222few2"
|
|
4594
4594
|
}
|
|
4595
4595
|
],
|
|
4596
4596
|
"errors": []
|
|
@@ -4677,19 +4677,6 @@ var findElementSchema = {
|
|
|
4677
4677
|
};
|
|
4678
4678
|
|
|
4679
4679
|
// src/ai-model/prompt/planning.ts
|
|
4680
|
-
var quickAnswerFormat = () => {
|
|
4681
|
-
const matchByPosition = getAIConfig(MATCH_BY_POSITION);
|
|
4682
|
-
const description = `
|
|
4683
|
-
${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
|
|
4684
|
-
`;
|
|
4685
|
-
const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
|
|
4686
|
-
const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
|
|
4687
|
-
return {
|
|
4688
|
-
description,
|
|
4689
|
-
format,
|
|
4690
|
-
sample
|
|
4691
|
-
};
|
|
4692
|
-
};
|
|
4693
4680
|
function systemPromptToTaskPlanning() {
|
|
4694
4681
|
return `
|
|
4695
4682
|
## Role:
|
|
@@ -4713,24 +4700,32 @@ Each action has a type and corresponding param. To be detailed:
|
|
|
4713
4700
|
* type: 'KeyboardPress', press a key
|
|
4714
4701
|
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4715
4702
|
* type: 'Scroll'
|
|
4716
|
-
* param: { scrollType: 'scrollDownOneScreen'
|
|
4703
|
+
* param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
|
|
4717
4704
|
* type: 'Error'
|
|
4718
4705
|
* param: { message: string }, the error message
|
|
4719
4706
|
* type: 'Sleep'
|
|
4720
4707
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4721
4708
|
|
|
4709
|
+
Here is an example of how to decompose a task.
|
|
4710
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4711
|
+
* Locate: 'The search bar'
|
|
4712
|
+
* Input: 'Weather in Shanghai'
|
|
4713
|
+
* Sleep: 1000
|
|
4714
|
+
* KeyboardPress: 'Enter'
|
|
4715
|
+
|
|
4722
4716
|
Remember:
|
|
4723
4717
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4724
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example,
|
|
4725
|
-
3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
|
|
4718
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4726
4719
|
|
|
4727
|
-
|
|
4720
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4728
4721
|
|
|
4729
|
-
|
|
4722
|
+
## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
|
|
4723
|
+
|
|
4724
|
+
If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4730
4725
|
{
|
|
4731
|
-
"reason": "It is located
|
|
4726
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4732
4727
|
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4733
|
-
${
|
|
4728
|
+
${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
|
|
4734
4729
|
}
|
|
4735
4730
|
|
|
4736
4731
|
If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
|
|
@@ -4743,71 +4738,33 @@ Please return the result in JSON format as follows:
|
|
|
4743
4738
|
actions: [ // always return in Array
|
|
4744
4739
|
{
|
|
4745
4740
|
"thought": "find out the search bar",
|
|
4746
|
-
"type": "Locate", //
|
|
4747
|
-
"param": {
|
|
4741
|
+
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4742
|
+
"param": {
|
|
4748
4743
|
"prompt": "The search bar"
|
|
4749
4744
|
},
|
|
4750
|
-
"quickAnswer": {
|
|
4751
|
-
"reason": "
|
|
4752
|
-
"text":
|
|
4753
|
-
${
|
|
4745
|
+
"quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
|
|
4746
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
|
|
4747
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4748
|
+
${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
|
|
4754
4749
|
} | null,
|
|
4755
4750
|
},
|
|
4756
4751
|
{
|
|
4757
4752
|
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4758
|
-
"type": "Tap",
|
|
4759
|
-
"param":
|
|
4753
|
+
"type": "Tap", // Type of action, like 'Tap' 'Hover' ...
|
|
4754
|
+
"param": any, // Parameter towards the task type
|
|
4760
4755
|
},
|
|
4761
|
-
// ... more actions
|
|
4762
|
-
],
|
|
4763
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4764
|
-
}
|
|
4765
|
-
|
|
4766
|
-
## Here is an example of how to decompose a task
|
|
4767
|
-
|
|
4768
|
-
When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
|
|
4769
|
-
|
|
4770
|
-
* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
|
|
4771
|
-
* Think and look in detail and fill all the fields in the JSON format.
|
|
4772
|
-
|
|
4773
|
-
\`\`\`json
|
|
4774
|
-
{
|
|
4775
|
-
queryLanguage: 'English',
|
|
4776
|
-
actions:[
|
|
4777
4756
|
{
|
|
4778
|
-
thought: "
|
|
4779
|
-
type:
|
|
4780
|
-
param: {
|
|
4781
|
-
|
|
4782
|
-
reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
|
|
4783
|
-
text: '中文',
|
|
4784
|
-
${quickAnswerFormat().sample}
|
|
4757
|
+
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4758
|
+
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4759
|
+
"param": {
|
|
4760
|
+
"prompt": "The search bar"
|
|
4785
4761
|
},
|
|
4762
|
+
"quickAnswer": null,
|
|
4786
4763
|
},
|
|
4787
|
-
|
|
4788
|
-
thought: 'Click the language switch button to open the language options.',
|
|
4789
|
-
type: 'Tap',
|
|
4790
|
-
param: null,
|
|
4791
|
-
},
|
|
4792
|
-
{
|
|
4793
|
-
thought: 'Wait for 1 second to ensure the language options are displayed.',
|
|
4794
|
-
type: 'Sleep',
|
|
4795
|
-
param: { timeMs: 1000 },
|
|
4796
|
-
},
|
|
4797
|
-
{
|
|
4798
|
-
thought: "Locate the 'English' option in the language menu.",
|
|
4799
|
-
type: 'Locate',
|
|
4800
|
-
param: { prompt: "The 'English' option in the language menu" },
|
|
4801
|
-
quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
|
|
4802
|
-
},
|
|
4803
|
-
{
|
|
4804
|
-
thought: "Click the 'English' option to switch the language.",
|
|
4805
|
-
type: 'Tap',
|
|
4806
|
-
param: null,
|
|
4807
|
-
}
|
|
4764
|
+
// ... more actions
|
|
4808
4765
|
],
|
|
4766
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4809
4767
|
}
|
|
4810
|
-
\`\`\`
|
|
4811
4768
|
`;
|
|
4812
4769
|
}
|
|
4813
4770
|
var planSchema = {
|
|
@@ -4845,7 +4802,7 @@ var planSchema = {
|
|
|
4845
4802
|
properties: {
|
|
4846
4803
|
reason: {
|
|
4847
4804
|
type: "string",
|
|
4848
|
-
description: "Reason for finding
|
|
4805
|
+
description: "Reason for finding element 4"
|
|
4849
4806
|
},
|
|
4850
4807
|
text: {
|
|
4851
4808
|
type: "string",
|
|
@@ -4906,6 +4863,8 @@ skill content:
|
|
|
4906
4863
|
|
|
4907
4864
|
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
4908
4865
|
|
|
4866
|
+
|
|
4867
|
+
|
|
4909
4868
|
Return in the following JSON format:
|
|
4910
4869
|
{
|
|
4911
4870
|
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
@@ -4953,7 +4912,8 @@ var assertSchema = {
|
|
|
4953
4912
|
function describeSize(size) {
|
|
4954
4913
|
return `${size.width} x ${size.height}`;
|
|
4955
4914
|
}
|
|
4956
|
-
function truncateText(text
|
|
4915
|
+
function truncateText(text) {
|
|
4916
|
+
const maxLength = 50;
|
|
4957
4917
|
if (text && text.length > maxLength) {
|
|
4958
4918
|
return `${text.slice(0, maxLength)}...`;
|
|
4959
4919
|
}
|
|
@@ -4985,15 +4945,16 @@ async function describeUserPage(context) {
|
|
|
4985
4945
|
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4986
4946
|
return {
|
|
4987
4947
|
description: `
|
|
4988
|
-
{
|
|
4989
|
-
|
|
4990
|
-
|
|
4948
|
+
{
|
|
4949
|
+
// The size of the page
|
|
4950
|
+
"pageSize": ${describeSize({ width, height })},
|
|
4991
4951
|
|
|
4992
|
-
|
|
4993
|
-
getAIConfig(MATCH_BY_POSITION) ?
|
|
4994
|
-
|
|
4995
|
-
|
|
4996
|
-
}
|
|
4952
|
+
${// if match by id, use the description of the element
|
|
4953
|
+
!getAIConfig(MATCH_BY_POSITION) ? `
|
|
4954
|
+
// json description of the element
|
|
4955
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4956
|
+
` : ""}
|
|
4957
|
+
}`,
|
|
4997
4958
|
elementById(id) {
|
|
4998
4959
|
(0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
|
|
4999
4960
|
const item = idElementMap[`${id}`];
|
|
@@ -5012,13 +4973,7 @@ function cropFieldInformation(elementsInfo) {
|
|
|
5012
4973
|
const tailorAttributes = Object.keys(attributes).reduce(
|
|
5013
4974
|
(res, currentKey) => {
|
|
5014
4975
|
const attributeVal = attributes[currentKey];
|
|
5015
|
-
|
|
5016
|
-
return res;
|
|
5017
|
-
if (currentKey === "nodeType") {
|
|
5018
|
-
res[currentKey] = attributeVal.replace(/\sNode$/, "");
|
|
5019
|
-
} else {
|
|
5020
|
-
res[currentKey] = truncateText(attributeVal);
|
|
5021
|
-
}
|
|
4976
|
+
res[currentKey] = truncateText(attributeVal);
|
|
5022
4977
|
return res;
|
|
5023
4978
|
},
|
|
5024
4979
|
{}
|
|
@@ -5027,18 +4982,12 @@ function cropFieldInformation(elementsInfo) {
|
|
|
5027
4982
|
id,
|
|
5028
4983
|
markerId: item.indexId,
|
|
5029
4984
|
attributes: tailorAttributes,
|
|
5030
|
-
rect
|
|
5031
|
-
left: rect.left,
|
|
5032
|
-
top: rect.top,
|
|
5033
|
-
width: rect.width,
|
|
5034
|
-
height: rect.height
|
|
5035
|
-
// remove 'zoom' if it exists
|
|
5036
|
-
},
|
|
4985
|
+
rect,
|
|
5037
4986
|
content: tailorContent
|
|
5038
4987
|
};
|
|
5039
4988
|
}
|
|
5040
4989
|
);
|
|
5041
|
-
return elementInfosDescription;
|
|
4990
|
+
return JSON.stringify(elementInfosDescription);
|
|
5042
4991
|
}
|
|
5043
4992
|
function retrieveElement(prompt, opt) {
|
|
5044
4993
|
if (opt == null ? void 0 : opt.multi) {
|
|
@@ -5173,8 +5122,7 @@ async function call(messages, responseFormat) {
|
|
|
5173
5122
|
messages,
|
|
5174
5123
|
response_format: responseFormat,
|
|
5175
5124
|
temperature: 0.1,
|
|
5176
|
-
stream: false
|
|
5177
|
-
max_tokens: 1e3
|
|
5125
|
+
stream: false
|
|
5178
5126
|
// betas: ['computer-use-2024-10-22'],
|
|
5179
5127
|
});
|
|
5180
5128
|
shouldPrintTiming && console.log(
|
|
@@ -5374,7 +5322,7 @@ function stringifyDumpData(data, indents) {
|
|
|
5374
5322
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
5375
5323
|
}
|
|
5376
5324
|
function getVersion() {
|
|
5377
|
-
return "0.8.2
|
|
5325
|
+
return "0.8.2";
|
|
5378
5326
|
}
|
|
5379
5327
|
|
|
5380
5328
|
// src/action/executor.ts
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-
|
|
1
|
+
export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-691f031e.js';
|
|
2
2
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
|
-
import './types-
|
|
3
|
+
import './types-29994b1b.js';
|
|
4
4
|
|
|
5
5
|
declare function systemPromptToFindElement(): string;
|
|
6
6
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ChatCompletionMessageParam } from 'openai/resources';
|
|
2
|
-
import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-
|
|
2
|
+
import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-29994b1b.js';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
|
5
5
|
ChatCompletionSystemMessageParam,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-0d8eeece.js';
|
|
2
|
-
export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-0d8eeece.js';
|
|
3
|
-
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-02a3ab02.js';
|
|
4
|
-
export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-02a3ab02.js';
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-29994b1b.js';
|
|
2
|
+
export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-29994b1b.js';
|
|
3
|
+
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-691f031e.js';
|
|
4
|
+
export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-691f031e.js';
|
|
5
5
|
export { getVersion, setLogDir } from './utils.js';
|
|
6
6
|
import 'openai/resources';
|
|
7
7
|
|
package/dist/lib/utils.js
CHANGED
|
@@ -213,7 +213,7 @@ function stringifyDumpData(data, indents) {
|
|
|
213
213
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
214
214
|
}
|
|
215
215
|
function getVersion() {
|
|
216
|
-
return "0.8.2-beta-20241115094249.0";
|
|
216
|
+
return "0.8.2";
|
|
217
217
|
}
|
|
218
218
|
// Annotate the CommonJS export names for ESM import in node:
|
|
219
219
|
0 && (module.exports = {
|