@midscene/core 0.8.3 → 0.8.5-beta-20241122072506.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +102 -51
- package/dist/lib/index.js +110 -53
- package/dist/lib/types/ai-model.d.ts +2 -2
- package/dist/lib/types/{index-690c2a06.d.ts → index-152b8346.d.ts} +1 -1
- package/dist/lib/types/index.d.ts +5 -5
- package/dist/lib/types/{types-29994b1b.d.ts → types-0d8eeece.d.ts} +3 -1
- package/dist/lib/types/utils.d.ts +3 -3
- package/dist/lib/utils.js +9 -4
- package/package.json +2 -2
- package/report/index.html +2 -2
package/dist/lib/ai-model.js
CHANGED
|
@@ -4497,7 +4497,7 @@ Input Example:
|
|
|
4497
4497
|
},
|
|
4498
4498
|
"elementInfos": [
|
|
4499
4499
|
{
|
|
4500
|
-
"id": "
|
|
4500
|
+
"id": "1231", // ID of the element
|
|
4501
4501
|
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4502
4502
|
"attributes": { // Attributes of the element
|
|
4503
4503
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4513,7 +4513,7 @@ Input Example:
|
|
|
4513
4513
|
}
|
|
4514
4514
|
},
|
|
4515
4515
|
{
|
|
4516
|
-
"id": "
|
|
4516
|
+
"id": "66551", // ID of the element
|
|
4517
4517
|
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4518
4518
|
"attributes": { // Attributes of the element
|
|
4519
4519
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4530,7 +4530,7 @@ Input Example:
|
|
|
4530
4530
|
},
|
|
4531
4531
|
...
|
|
4532
4532
|
{
|
|
4533
|
-
"id": "
|
|
4533
|
+
"id": "12344",
|
|
4534
4534
|
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4535
4535
|
"attributes": {
|
|
4536
4536
|
"nodeType": "TEXT Node",
|
|
@@ -4563,7 +4563,7 @@ Output Example:
|
|
|
4563
4563
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4564
4564
|
"text": "",
|
|
4565
4565
|
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
4566
|
-
"id": "
|
|
4566
|
+
"id": "1231"
|
|
4567
4567
|
}
|
|
4568
4568
|
],
|
|
4569
4569
|
"errors": []
|
|
@@ -4650,6 +4650,19 @@ var findElementSchema = {
|
|
|
4650
4650
|
};
|
|
4651
4651
|
|
|
4652
4652
|
// src/ai-model/prompt/planning.ts
|
|
4653
|
+
var quickAnswerFormat = () => {
|
|
4654
|
+
const matchByPosition = getAIConfig(MATCH_BY_POSITION);
|
|
4655
|
+
const description = `
|
|
4656
|
+
${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
|
|
4657
|
+
`;
|
|
4658
|
+
const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
|
|
4659
|
+
const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
|
|
4660
|
+
return {
|
|
4661
|
+
description,
|
|
4662
|
+
format,
|
|
4663
|
+
sample
|
|
4664
|
+
};
|
|
4665
|
+
};
|
|
4653
4666
|
function systemPromptToTaskPlanning() {
|
|
4654
4667
|
return `
|
|
4655
4668
|
## Role:
|
|
@@ -4673,32 +4686,24 @@ Each action has a type and corresponding param. To be detailed:
|
|
|
4673
4686
|
* type: 'KeyboardPress', press a key
|
|
4674
4687
|
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4675
4688
|
* type: 'Scroll'
|
|
4676
|
-
* param: { scrollType: 'scrollDownOneScreen'
|
|
4689
|
+
* param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
|
|
4677
4690
|
* type: 'Error'
|
|
4678
4691
|
* param: { message: string }, the error message
|
|
4679
4692
|
* type: 'Sleep'
|
|
4680
4693
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4681
4694
|
|
|
4682
|
-
Here is an example of how to decompose a task.
|
|
4683
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4684
|
-
* Locate: 'The search bar'
|
|
4685
|
-
* Input: 'Weather in Shanghai'
|
|
4686
|
-
* Sleep: 1000
|
|
4687
|
-
* KeyboardPress: 'Enter'
|
|
4688
|
-
|
|
4689
4695
|
Remember:
|
|
4690
4696
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4691
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example,
|
|
4697
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4698
|
+
3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
|
|
4692
4699
|
|
|
4693
|
-
|
|
4700
|
+
## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
|
|
4694
4701
|
|
|
4695
|
-
|
|
4696
|
-
|
|
4697
|
-
If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4702
|
+
If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4698
4703
|
{
|
|
4699
|
-
"reason": "
|
|
4704
|
+
"reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
|
|
4700
4705
|
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4701
|
-
${
|
|
4706
|
+
${quickAnswerFormat().description}
|
|
4702
4707
|
}
|
|
4703
4708
|
|
|
4704
4709
|
If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
|
|
@@ -4711,33 +4716,71 @@ Please return the result in JSON format as follows:
|
|
|
4711
4716
|
actions: [ // always return in Array
|
|
4712
4717
|
{
|
|
4713
4718
|
"thought": "find out the search bar",
|
|
4714
|
-
"type": "Locate", //
|
|
4715
|
-
"param": {
|
|
4719
|
+
"type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
|
|
4720
|
+
"param": { //
|
|
4716
4721
|
"prompt": "The search bar"
|
|
4717
4722
|
},
|
|
4718
|
-
"quickAnswer": {
|
|
4719
|
-
"reason": "
|
|
4720
|
-
"text":
|
|
4721
|
-
${
|
|
4723
|
+
"quickAnswer": {
|
|
4724
|
+
"reason": "This is ...",
|
|
4725
|
+
"text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4726
|
+
${quickAnswerFormat().format}
|
|
4722
4727
|
} | null,
|
|
4723
4728
|
},
|
|
4724
4729
|
{
|
|
4725
4730
|
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4726
|
-
"type": "Tap",
|
|
4727
|
-
"param":
|
|
4731
|
+
"type": "Tap",
|
|
4732
|
+
"param": null,
|
|
4728
4733
|
},
|
|
4734
|
+
// ... more actions
|
|
4735
|
+
],
|
|
4736
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4737
|
+
}
|
|
4738
|
+
|
|
4739
|
+
## Here is an example of how to decompose a task
|
|
4740
|
+
|
|
4741
|
+
When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
|
|
4742
|
+
|
|
4743
|
+
* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
|
|
4744
|
+
* Think and look in detail and fill all the fields in the JSON format.
|
|
4745
|
+
|
|
4746
|
+
\`\`\`json
|
|
4747
|
+
{
|
|
4748
|
+
queryLanguage: 'English',
|
|
4749
|
+
actions:[
|
|
4729
4750
|
{
|
|
4730
|
-
|
|
4731
|
-
|
|
4732
|
-
|
|
4733
|
-
|
|
4751
|
+
thought: "Locate the language switch button with the text '中文'.",
|
|
4752
|
+
type: 'Locate',
|
|
4753
|
+
param: { prompt: "The language switch button with the text '中文'" },
|
|
4754
|
+
quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
|
|
4755
|
+
reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
|
|
4756
|
+
text: '中文',
|
|
4757
|
+
${quickAnswerFormat().sample}
|
|
4734
4758
|
},
|
|
4735
|
-
"quickAnswer": null,
|
|
4736
4759
|
},
|
|
4737
|
-
|
|
4760
|
+
{
|
|
4761
|
+
thought: 'Click the language switch button to open the language options.',
|
|
4762
|
+
type: 'Tap',
|
|
4763
|
+
param: null,
|
|
4764
|
+
},
|
|
4765
|
+
{
|
|
4766
|
+
thought: 'Wait for 1 second to ensure the language options are displayed.',
|
|
4767
|
+
type: 'Sleep',
|
|
4768
|
+
param: { timeMs: 1000 },
|
|
4769
|
+
},
|
|
4770
|
+
{
|
|
4771
|
+
thought: "Locate the 'English' option in the language menu.",
|
|
4772
|
+
type: 'Locate',
|
|
4773
|
+
param: { prompt: "The 'English' option in the language menu" },
|
|
4774
|
+
quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
|
|
4775
|
+
},
|
|
4776
|
+
{
|
|
4777
|
+
thought: "Click the 'English' option to switch the language.",
|
|
4778
|
+
type: 'Tap',
|
|
4779
|
+
param: null,
|
|
4780
|
+
}
|
|
4738
4781
|
],
|
|
4739
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4740
4782
|
}
|
|
4783
|
+
\`\`\`
|
|
4741
4784
|
`;
|
|
4742
4785
|
}
|
|
4743
4786
|
var planSchema = {
|
|
@@ -4775,7 +4818,7 @@ var planSchema = {
|
|
|
4775
4818
|
properties: {
|
|
4776
4819
|
reason: {
|
|
4777
4820
|
type: "string",
|
|
4778
|
-
description: "Reason for finding element
|
|
4821
|
+
description: "Reason for finding this element"
|
|
4779
4822
|
},
|
|
4780
4823
|
text: {
|
|
4781
4824
|
type: "string",
|
|
@@ -4833,8 +4876,6 @@ skill content:
|
|
|
4833
4876
|
|
|
4834
4877
|
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
4835
4878
|
|
|
4836
|
-
|
|
4837
|
-
|
|
4838
4879
|
Return in the following JSON format:
|
|
4839
4880
|
{
|
|
4840
4881
|
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
@@ -4882,8 +4923,7 @@ var assertSchema = {
|
|
|
4882
4923
|
function describeSize(size) {
|
|
4883
4924
|
return `${size.width} x ${size.height}`;
|
|
4884
4925
|
}
|
|
4885
|
-
function truncateText(text) {
|
|
4886
|
-
const maxLength = 50;
|
|
4926
|
+
function truncateText(text, maxLength = 20) {
|
|
4887
4927
|
if (text && text.length > maxLength) {
|
|
4888
4928
|
return `${text.slice(0, maxLength)}...`;
|
|
4889
4929
|
}
|
|
@@ -4915,16 +4955,15 @@ async function describeUserPage(context) {
|
|
|
4915
4955
|
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4916
4956
|
return {
|
|
4917
4957
|
description: `
|
|
4918
|
-
|
|
4919
|
-
|
|
4920
|
-
|
|
4958
|
+
{
|
|
4959
|
+
// The size of the page
|
|
4960
|
+
"pageSize": ${describeSize({ width, height })},
|
|
4921
4961
|
|
|
4922
|
-
|
|
4923
|
-
|
|
4924
|
-
|
|
4925
|
-
|
|
4926
|
-
|
|
4927
|
-
}`,
|
|
4962
|
+
${// if match by id, use the description of the element
|
|
4963
|
+
getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
|
|
4964
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4965
|
+
`}
|
|
4966
|
+
}`,
|
|
4928
4967
|
elementById(id) {
|
|
4929
4968
|
(0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
|
|
4930
4969
|
const item = idElementMap[`${id}`];
|
|
@@ -4943,7 +4982,13 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4943
4982
|
const tailorAttributes = Object.keys(attributes).reduce(
|
|
4944
4983
|
(res, currentKey) => {
|
|
4945
4984
|
const attributeVal = attributes[currentKey];
|
|
4946
|
-
|
|
4985
|
+
if (currentKey === "style" || currentKey === "src")
|
|
4986
|
+
return res;
|
|
4987
|
+
if (currentKey === "nodeType") {
|
|
4988
|
+
res[currentKey] = attributeVal.replace(/\sNode$/, "");
|
|
4989
|
+
} else {
|
|
4990
|
+
res[currentKey] = truncateText(attributeVal);
|
|
4991
|
+
}
|
|
4947
4992
|
return res;
|
|
4948
4993
|
},
|
|
4949
4994
|
{}
|
|
@@ -4952,12 +4997,18 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4952
4997
|
id,
|
|
4953
4998
|
markerId: item.indexId,
|
|
4954
4999
|
attributes: tailorAttributes,
|
|
4955
|
-
rect
|
|
5000
|
+
rect: {
|
|
5001
|
+
left: rect.left,
|
|
5002
|
+
top: rect.top,
|
|
5003
|
+
width: rect.width,
|
|
5004
|
+
height: rect.height
|
|
5005
|
+
// remove 'zoom' if it exists
|
|
5006
|
+
},
|
|
4956
5007
|
content: tailorContent
|
|
4957
5008
|
};
|
|
4958
5009
|
}
|
|
4959
5010
|
);
|
|
4960
|
-
return
|
|
5011
|
+
return elementInfosDescription;
|
|
4961
5012
|
}
|
|
4962
5013
|
|
|
4963
5014
|
// src/ai-model/openai/index.ts
|
package/dist/lib/index.js
CHANGED
|
@@ -4292,6 +4292,7 @@ __export(src_exports, {
|
|
|
4292
4292
|
default: () => src_default,
|
|
4293
4293
|
getAIConfig: () => getAIConfig,
|
|
4294
4294
|
getElement: () => getElement,
|
|
4295
|
+
getLogDirByType: () => getLogDirByType,
|
|
4295
4296
|
getSection: () => getSection,
|
|
4296
4297
|
getVersion: () => getVersion,
|
|
4297
4298
|
overrideAIConfig: () => overrideAIConfig,
|
|
@@ -4524,7 +4525,7 @@ Input Example:
|
|
|
4524
4525
|
},
|
|
4525
4526
|
"elementInfos": [
|
|
4526
4527
|
{
|
|
4527
|
-
"id": "
|
|
4528
|
+
"id": "1231", // ID of the element
|
|
4528
4529
|
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4529
4530
|
"attributes": { // Attributes of the element
|
|
4530
4531
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4540,7 +4541,7 @@ Input Example:
|
|
|
4540
4541
|
}
|
|
4541
4542
|
},
|
|
4542
4543
|
{
|
|
4543
|
-
"id": "
|
|
4544
|
+
"id": "66551", // ID of the element
|
|
4544
4545
|
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4545
4546
|
"attributes": { // Attributes of the element
|
|
4546
4547
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4557,7 +4558,7 @@ Input Example:
|
|
|
4557
4558
|
},
|
|
4558
4559
|
...
|
|
4559
4560
|
{
|
|
4560
|
-
"id": "
|
|
4561
|
+
"id": "12344",
|
|
4561
4562
|
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4562
4563
|
"attributes": {
|
|
4563
4564
|
"nodeType": "TEXT Node",
|
|
@@ -4590,7 +4591,7 @@ Output Example:
|
|
|
4590
4591
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4591
4592
|
"text": "",
|
|
4592
4593
|
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
4593
|
-
"id": "
|
|
4594
|
+
"id": "1231"
|
|
4594
4595
|
}
|
|
4595
4596
|
],
|
|
4596
4597
|
"errors": []
|
|
@@ -4677,6 +4678,19 @@ var findElementSchema = {
|
|
|
4677
4678
|
};
|
|
4678
4679
|
|
|
4679
4680
|
// src/ai-model/prompt/planning.ts
|
|
4681
|
+
var quickAnswerFormat = () => {
|
|
4682
|
+
const matchByPosition = getAIConfig(MATCH_BY_POSITION);
|
|
4683
|
+
const description = `
|
|
4684
|
+
${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
|
|
4685
|
+
`;
|
|
4686
|
+
const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
|
|
4687
|
+
const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
|
|
4688
|
+
return {
|
|
4689
|
+
description,
|
|
4690
|
+
format,
|
|
4691
|
+
sample
|
|
4692
|
+
};
|
|
4693
|
+
};
|
|
4680
4694
|
function systemPromptToTaskPlanning() {
|
|
4681
4695
|
return `
|
|
4682
4696
|
## Role:
|
|
@@ -4700,32 +4714,24 @@ Each action has a type and corresponding param. To be detailed:
|
|
|
4700
4714
|
* type: 'KeyboardPress', press a key
|
|
4701
4715
|
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4702
4716
|
* type: 'Scroll'
|
|
4703
|
-
* param: { scrollType: 'scrollDownOneScreen'
|
|
4717
|
+
* param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
|
|
4704
4718
|
* type: 'Error'
|
|
4705
4719
|
* param: { message: string }, the error message
|
|
4706
4720
|
* type: 'Sleep'
|
|
4707
4721
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4708
4722
|
|
|
4709
|
-
Here is an example of how to decompose a task.
|
|
4710
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4711
|
-
* Locate: 'The search bar'
|
|
4712
|
-
* Input: 'Weather in Shanghai'
|
|
4713
|
-
* Sleep: 1000
|
|
4714
|
-
* KeyboardPress: 'Enter'
|
|
4715
|
-
|
|
4716
4723
|
Remember:
|
|
4717
4724
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4718
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example,
|
|
4725
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4726
|
+
3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
|
|
4719
4727
|
|
|
4720
|
-
|
|
4728
|
+
## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
|
|
4721
4729
|
|
|
4722
|
-
|
|
4723
|
-
|
|
4724
|
-
If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4730
|
+
If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4725
4731
|
{
|
|
4726
|
-
"reason": "
|
|
4732
|
+
"reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
|
|
4727
4733
|
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4728
|
-
${
|
|
4734
|
+
${quickAnswerFormat().description}
|
|
4729
4735
|
}
|
|
4730
4736
|
|
|
4731
4737
|
If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
|
|
@@ -4738,33 +4744,71 @@ Please return the result in JSON format as follows:
|
|
|
4738
4744
|
actions: [ // always return in Array
|
|
4739
4745
|
{
|
|
4740
4746
|
"thought": "find out the search bar",
|
|
4741
|
-
"type": "Locate", //
|
|
4742
|
-
"param": {
|
|
4747
|
+
"type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
|
|
4748
|
+
"param": { //
|
|
4743
4749
|
"prompt": "The search bar"
|
|
4744
4750
|
},
|
|
4745
|
-
"quickAnswer": {
|
|
4746
|
-
"reason": "
|
|
4747
|
-
"text":
|
|
4748
|
-
${
|
|
4751
|
+
"quickAnswer": {
|
|
4752
|
+
"reason": "This is ...",
|
|
4753
|
+
"text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4754
|
+
${quickAnswerFormat().format}
|
|
4749
4755
|
} | null,
|
|
4750
4756
|
},
|
|
4751
4757
|
{
|
|
4752
4758
|
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4753
|
-
"type": "Tap",
|
|
4754
|
-
"param":
|
|
4759
|
+
"type": "Tap",
|
|
4760
|
+
"param": null,
|
|
4755
4761
|
},
|
|
4762
|
+
// ... more actions
|
|
4763
|
+
],
|
|
4764
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4765
|
+
}
|
|
4766
|
+
|
|
4767
|
+
## Here is an example of how to decompose a task
|
|
4768
|
+
|
|
4769
|
+
When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
|
|
4770
|
+
|
|
4771
|
+
* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
|
|
4772
|
+
* Think and look in detail and fill all the fields in the JSON format.
|
|
4773
|
+
|
|
4774
|
+
\`\`\`json
|
|
4775
|
+
{
|
|
4776
|
+
queryLanguage: 'English',
|
|
4777
|
+
actions:[
|
|
4756
4778
|
{
|
|
4757
|
-
|
|
4758
|
-
|
|
4759
|
-
|
|
4760
|
-
|
|
4779
|
+
thought: "Locate the language switch button with the text '中文'.",
|
|
4780
|
+
type: 'Locate',
|
|
4781
|
+
param: { prompt: "The language switch button with the text '中文'" },
|
|
4782
|
+
quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
|
|
4783
|
+
reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
|
|
4784
|
+
text: '中文',
|
|
4785
|
+
${quickAnswerFormat().sample}
|
|
4761
4786
|
},
|
|
4762
|
-
"quickAnswer": null,
|
|
4763
4787
|
},
|
|
4764
|
-
|
|
4788
|
+
{
|
|
4789
|
+
thought: 'Click the language switch button to open the language options.',
|
|
4790
|
+
type: 'Tap',
|
|
4791
|
+
param: null,
|
|
4792
|
+
},
|
|
4793
|
+
{
|
|
4794
|
+
thought: 'Wait for 1 second to ensure the language options are displayed.',
|
|
4795
|
+
type: 'Sleep',
|
|
4796
|
+
param: { timeMs: 1000 },
|
|
4797
|
+
},
|
|
4798
|
+
{
|
|
4799
|
+
thought: "Locate the 'English' option in the language menu.",
|
|
4800
|
+
type: 'Locate',
|
|
4801
|
+
param: { prompt: "The 'English' option in the language menu" },
|
|
4802
|
+
quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
|
|
4803
|
+
},
|
|
4804
|
+
{
|
|
4805
|
+
thought: "Click the 'English' option to switch the language.",
|
|
4806
|
+
type: 'Tap',
|
|
4807
|
+
param: null,
|
|
4808
|
+
}
|
|
4765
4809
|
],
|
|
4766
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4767
4810
|
}
|
|
4811
|
+
\`\`\`
|
|
4768
4812
|
`;
|
|
4769
4813
|
}
|
|
4770
4814
|
var planSchema = {
|
|
@@ -4802,7 +4846,7 @@ var planSchema = {
|
|
|
4802
4846
|
properties: {
|
|
4803
4847
|
reason: {
|
|
4804
4848
|
type: "string",
|
|
4805
|
-
description: "Reason for finding element
|
|
4849
|
+
description: "Reason for finding this element"
|
|
4806
4850
|
},
|
|
4807
4851
|
text: {
|
|
4808
4852
|
type: "string",
|
|
@@ -4863,8 +4907,6 @@ skill content:
|
|
|
4863
4907
|
|
|
4864
4908
|
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
4865
4909
|
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
4910
|
Return in the following JSON format:
|
|
4869
4911
|
{
|
|
4870
4912
|
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
@@ -4912,8 +4954,7 @@ var assertSchema = {
|
|
|
4912
4954
|
function describeSize(size) {
|
|
4913
4955
|
return `${size.width} x ${size.height}`;
|
|
4914
4956
|
}
|
|
4915
|
-
function truncateText(text) {
|
|
4916
|
-
const maxLength = 50;
|
|
4957
|
+
function truncateText(text, maxLength = 20) {
|
|
4917
4958
|
if (text && text.length > maxLength) {
|
|
4918
4959
|
return `${text.slice(0, maxLength)}...`;
|
|
4919
4960
|
}
|
|
@@ -4945,16 +4986,15 @@ async function describeUserPage(context) {
|
|
|
4945
4986
|
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4946
4987
|
return {
|
|
4947
4988
|
description: `
|
|
4948
|
-
|
|
4949
|
-
|
|
4950
|
-
|
|
4989
|
+
{
|
|
4990
|
+
// The size of the page
|
|
4991
|
+
"pageSize": ${describeSize({ width, height })},
|
|
4951
4992
|
|
|
4952
|
-
|
|
4953
|
-
|
|
4954
|
-
|
|
4955
|
-
|
|
4956
|
-
|
|
4957
|
-
}`,
|
|
4993
|
+
${// if match by id, use the description of the element
|
|
4994
|
+
getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
|
|
4995
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4996
|
+
`}
|
|
4997
|
+
}`,
|
|
4958
4998
|
elementById(id) {
|
|
4959
4999
|
(0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
|
|
4960
5000
|
const item = idElementMap[`${id}`];
|
|
@@ -4973,7 +5013,13 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4973
5013
|
const tailorAttributes = Object.keys(attributes).reduce(
|
|
4974
5014
|
(res, currentKey) => {
|
|
4975
5015
|
const attributeVal = attributes[currentKey];
|
|
4976
|
-
|
|
5016
|
+
if (currentKey === "style" || currentKey === "src")
|
|
5017
|
+
return res;
|
|
5018
|
+
if (currentKey === "nodeType") {
|
|
5019
|
+
res[currentKey] = attributeVal.replace(/\sNode$/, "");
|
|
5020
|
+
} else {
|
|
5021
|
+
res[currentKey] = truncateText(attributeVal);
|
|
5022
|
+
}
|
|
4977
5023
|
return res;
|
|
4978
5024
|
},
|
|
4979
5025
|
{}
|
|
@@ -4982,12 +5028,18 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4982
5028
|
id,
|
|
4983
5029
|
markerId: item.indexId,
|
|
4984
5030
|
attributes: tailorAttributes,
|
|
4985
|
-
rect
|
|
5031
|
+
rect: {
|
|
5032
|
+
left: rect.left,
|
|
5033
|
+
top: rect.top,
|
|
5034
|
+
width: rect.width,
|
|
5035
|
+
height: rect.height
|
|
5036
|
+
// remove 'zoom' if it exists
|
|
5037
|
+
},
|
|
4986
5038
|
content: tailorContent
|
|
4987
5039
|
};
|
|
4988
5040
|
}
|
|
4989
5041
|
);
|
|
4990
|
-
return
|
|
5042
|
+
return elementInfosDescription;
|
|
4991
5043
|
}
|
|
4992
5044
|
function retrieveElement(prompt, opt) {
|
|
4993
5045
|
if (opt == null ? void 0 : opt.multi) {
|
|
@@ -5294,6 +5346,7 @@ function writeLogFile(opts) {
|
|
|
5294
5346
|
# Midscene.js dump files
|
|
5295
5347
|
${logDirName}/report
|
|
5296
5348
|
${logDirName}/dump
|
|
5349
|
+
${logDirName}/tmp
|
|
5297
5350
|
`,
|
|
5298
5351
|
"utf-8"
|
|
5299
5352
|
);
|
|
@@ -5325,7 +5378,7 @@ function stringifyDumpData(data, indents) {
|
|
|
5325
5378
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
5326
5379
|
}
|
|
5327
5380
|
function getVersion() {
|
|
5328
|
-
return "0.8.
|
|
5381
|
+
return "0.8.5-beta-20241122072506.0";
|
|
5329
5382
|
}
|
|
5330
5383
|
|
|
5331
5384
|
// src/action/executor.ts
|
|
@@ -5348,9 +5401,12 @@ var Executor = class {
|
|
|
5348
5401
|
};
|
|
5349
5402
|
}
|
|
5350
5403
|
async append(task) {
|
|
5404
|
+
var _a, _b;
|
|
5351
5405
|
(0, import_node_assert5.default)(
|
|
5352
5406
|
this.status !== "error",
|
|
5353
|
-
|
|
5407
|
+
`executor is in error state, cannot append task
|
|
5408
|
+
error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
|
|
5409
|
+
${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
|
|
5354
5410
|
);
|
|
5355
5411
|
if (Array.isArray(task)) {
|
|
5356
5412
|
this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
|
|
@@ -6094,6 +6150,7 @@ var src_default = Insight;
|
|
|
6094
6150
|
allAIConfig,
|
|
6095
6151
|
getAIConfig,
|
|
6096
6152
|
getElement,
|
|
6153
|
+
getLogDirByType,
|
|
6097
6154
|
getSection,
|
|
6098
6155
|
getVersion,
|
|
6099
6156
|
overrideAIConfig,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-
|
|
1
|
+
export { h as AiAssert, f as AiExtractElementInfo, A as AiInspectElement, c as callAiFn, d as callToGetJSONObject, e as describeUserPage, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
|
|
2
2
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
|
-
import './types-
|
|
3
|
+
import './types-0d8eeece.js';
|
|
4
4
|
|
|
5
5
|
declare function systemPromptToFindElement(): string;
|
|
6
6
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ChatCompletionMessageParam } from 'openai/resources';
|
|
2
|
-
import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-
|
|
2
|
+
import { j as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, k as AISectionParseResponse, l as AIAssertionResponse, x as PlanningAction } from './types-0d8eeece.js';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
|
5
5
|
ChatCompletionSystemMessageParam,
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-
|
|
2
|
-
export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-
|
|
3
|
-
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-
|
|
4
|
-
export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-
|
|
5
|
-
export { getVersion, setLogDir } from './utils.js';
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightOptions, c as InsightTaskInfo, A as AISingleElementResponse, d as InsightAssertionResponse } from './types-0d8eeece.js';
|
|
2
|
+
export { l as AIAssertionResponse, h as AIElementIdResponse, i as AIElementPositionResponse, j as AIElementResponse, e as AIResponseFormat, k as AISectionParseResponse, f as AISingleElementResponseById, g as AISingleElementResponseByPosition, w as AgentAssertOpt, v as AgentWaitForOpt, Q as BaseAgentParserOpt, o as BasicSectionQuery, C as CallAIFn, O as Color, q as DumpMeta, u as ElementById, n as EnsureObject, W as ExecutionRecorderItem, ac as ExecutionTaskAction, ab as ExecutionTaskActionApply, aa as ExecutionTaskInsightAssertion, a9 as ExecutionTaskInsightAssertionApply, a8 as ExecutionTaskInsightAssertionParam, a1 as ExecutionTaskInsightDumpLog, a3 as ExecutionTaskInsightLocate, a2 as ExecutionTaskInsightLocateApply, a0 as ExecutionTaskInsightLocateOutput, $ as ExecutionTaskInsightLocateParam, a7 as ExecutionTaskInsightQuery, a6 as ExecutionTaskInsightQueryApply, a5 as ExecutionTaskInsightQueryOutput, a4 as ExecutionTaskInsightQueryParam, ae as ExecutionTaskPlanning, ad as ExecutionTaskPlanningApply, _ as ExecutionTaskReturn, X as ExecutionTaskType, Y as ExecutorContext, af as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, t as PartialInsightDumpFromSDK, y as PlanningAIResponse, x as PlanningAction, J as PlanningActionParamAssert, M as PlanningActionParamError, F as PlanningActionParamHover, G as PlanningActionParamInputOrKeyPress, H as PlanningActionParamScroll, K as PlanningActionParamSleep, z as PlanningActionParamTap, N as PlanningActionParamWaitFor, V as PlaywrightParserOpt, P as Point, T as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, Z as TaskCacheInfo, m as UISection } from './types-0d8eeece.js';
|
|
3
|
+
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-152b8346.js';
|
|
4
|
+
export { b as allAIConfig, g as getAIConfig, o as overrideAIConfig, p as plan, t as transformElementPositionToId } from './index-152b8346.js';
|
|
5
|
+
export { getLogDirByType, getVersion, setLogDir } from './utils.js';
|
|
6
6
|
import 'openai/resources';
|
|
7
7
|
|
|
8
8
|
declare class Executor {
|