@midscene/core 0.3.0 → 0.3.1-beta-20240821105917.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +2 -2
- package/dist/es/ai-model.js +6 -5
- package/dist/es/image.js +45 -12
- package/dist/es/index.js +11 -9
- package/dist/lib/ai-model.js +6 -5
- package/dist/lib/image.js +35 -5
- package/dist/lib/index.js +11 -9
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/image.d.ts +1 -1
- package/dist/types/{index-f43935c0.d.ts → index-0479d487.d.ts} +1 -1
- package/dist/types/index.d.ts +4 -4
- package/dist/types/{types-81f7991c.d.ts → types-3eb61b5c.d.ts} +17 -4
- package/dist/types/utils.d.ts +1 -1
- package/package.json +8 -7
- package/report/index.html +1 -1
package/LICENSE
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
MIT License
|
|
2
2
|
|
|
3
|
-
Copyright (c) 2024-present
|
|
3
|
+
Copyright (c) 2024-present Bytedance, Inc. and its affiliates.
|
|
4
4
|
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
6
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
21
|
+
SOFTWARE.
|
package/dist/es/ai-model.js
CHANGED
|
@@ -4256,7 +4256,8 @@ async function call(messages, responseFormat) {
|
|
|
4256
4256
|
const completion = await openai.chat.completions.create({
|
|
4257
4257
|
model,
|
|
4258
4258
|
messages,
|
|
4259
|
-
response_format: { type: responseFormat }
|
|
4259
|
+
response_format: { type: responseFormat },
|
|
4260
|
+
temperature: 0.2
|
|
4260
4261
|
});
|
|
4261
4262
|
const { content } = completion.choices[0].message;
|
|
4262
4263
|
assert(content, "empty content");
|
|
@@ -4615,8 +4616,8 @@ async function callCozeAi(options) {
|
|
|
4615
4616
|
}
|
|
4616
4617
|
const aiResponse = await completion.json();
|
|
4617
4618
|
if (aiResponse.code !== 0) {
|
|
4618
|
-
console.error("CozeAI error response", aiResponse);
|
|
4619
|
-
throw new Error(
|
|
4619
|
+
console.error("CozeAI error response", aiResponse.msg);
|
|
4620
|
+
throw new Error(`CozeAI error response ${aiResponse.msg}`);
|
|
4620
4621
|
}
|
|
4621
4622
|
if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
|
|
4622
4623
|
console.error("aiResponse", aiResponse);
|
|
@@ -4869,7 +4870,7 @@ function systemPromptToTaskPlanning() {
|
|
|
4869
4870
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4870
4871
|
|
|
4871
4872
|
Here is an example of how to decompose a task.
|
|
4872
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you
|
|
4873
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4873
4874
|
* Find: 'The search bar'
|
|
4874
4875
|
* Input: 'Weather in Shanghai'
|
|
4875
4876
|
* Sleep: 1000
|
|
@@ -4879,7 +4880,7 @@ function systemPromptToTaskPlanning() {
|
|
|
4879
4880
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4880
4881
|
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4881
4882
|
|
|
4882
|
-
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal.
|
|
4883
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4883
4884
|
|
|
4884
4885
|
Return in the following JSON format:
|
|
4885
4886
|
{
|
package/dist/es/image.js
CHANGED
|
@@ -1,6 +1,23 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
|
|
3
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
4
|
+
var __propIsEnum = Object.prototype.propertyIsEnumerable;
|
|
5
|
+
var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
|
|
6
|
+
var __spreadValues = (a, b) => {
|
|
7
|
+
for (var prop in b || (b = {}))
|
|
8
|
+
if (__hasOwnProp.call(b, prop))
|
|
9
|
+
__defNormalProp(a, prop, b[prop]);
|
|
10
|
+
if (__getOwnPropSymbols)
|
|
11
|
+
for (var prop of __getOwnPropSymbols(b)) {
|
|
12
|
+
if (__propIsEnum.call(b, prop))
|
|
13
|
+
__defNormalProp(a, prop, b[prop]);
|
|
14
|
+
}
|
|
15
|
+
return a;
|
|
16
|
+
};
|
|
17
|
+
|
|
1
18
|
// src/image/info.ts
|
|
2
19
|
import assert from "assert";
|
|
3
|
-
import { Buffer } from "buffer";
|
|
20
|
+
import { Buffer as Buffer2 } from "buffer";
|
|
4
21
|
import { readFileSync } from "fs";
|
|
5
22
|
import Sharp from "sharp";
|
|
6
23
|
async function imageInfo(image) {
|
|
@@ -10,7 +27,7 @@ async function imageInfo(image) {
|
|
|
10
27
|
}
|
|
11
28
|
async function imageInfoOfBase64(imageBase64) {
|
|
12
29
|
const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
|
|
13
|
-
return imageInfo(
|
|
30
|
+
return imageInfo(Buffer2.from(base64Data, "base64"));
|
|
14
31
|
}
|
|
15
32
|
function base64Encoded(image, withHeader = true) {
|
|
16
33
|
const imageBuffer = readFileSync(image);
|
|
@@ -27,12 +44,12 @@ function base64Encoded(image, withHeader = true) {
|
|
|
27
44
|
}
|
|
28
45
|
|
|
29
46
|
// src/image/transform.ts
|
|
30
|
-
import { Buffer as
|
|
47
|
+
import { Buffer as Buffer3 } from "buffer";
|
|
31
48
|
import Sharp2 from "sharp";
|
|
32
49
|
async function saveBase64Image(options) {
|
|
33
50
|
const { base64Data, outputPath } = options;
|
|
34
51
|
const base64Image = base64Data.split(";base64,").pop() || base64Data;
|
|
35
|
-
const imageBuffer =
|
|
52
|
+
const imageBuffer = Buffer3.from(base64Image, "base64");
|
|
36
53
|
await Sharp2(imageBuffer).toFile(outputPath);
|
|
37
54
|
console.log("Image successfully written to file.");
|
|
38
55
|
}
|
|
@@ -44,7 +61,7 @@ async function transformImgPathToBase64(inputPath) {
|
|
|
44
61
|
}
|
|
45
62
|
async function resizeImg(base64Data) {
|
|
46
63
|
const base64Image = base64Data.split(";base64,").pop() || base64Data;
|
|
47
|
-
const imageBuffer =
|
|
64
|
+
const imageBuffer = Buffer3.from(base64Image, "base64");
|
|
48
65
|
const metadata = await Sharp2(imageBuffer).metadata();
|
|
49
66
|
const { width, height } = metadata;
|
|
50
67
|
if (!width || !height) {
|
|
@@ -99,26 +116,42 @@ async function alignCoordByTrim(image, centerRect) {
|
|
|
99
116
|
if (!(imgInfo == null ? void 0 : imgInfo.width) || !imgInfo.height || imgInfo.width <= 3 || imgInfo.height <= 3) {
|
|
100
117
|
return centerRect;
|
|
101
118
|
}
|
|
119
|
+
const zeroSize = {
|
|
120
|
+
left: 0,
|
|
121
|
+
top: 0,
|
|
122
|
+
width: -1,
|
|
123
|
+
height: -1
|
|
124
|
+
};
|
|
125
|
+
const finalCenterRect = __spreadValues({}, centerRect);
|
|
126
|
+
if (centerRect.left > imgInfo.width || centerRect.top > imgInfo.height) {
|
|
127
|
+
return zeroSize;
|
|
128
|
+
}
|
|
129
|
+
if (centerRect.left + centerRect.width > imgInfo.width) {
|
|
130
|
+
finalCenterRect.width = imgInfo.width - centerRect.left;
|
|
131
|
+
}
|
|
132
|
+
if (centerRect.top + centerRect.height > imgInfo.height) {
|
|
133
|
+
finalCenterRect.height = imgInfo.height - centerRect.top;
|
|
134
|
+
}
|
|
102
135
|
try {
|
|
103
|
-
const img = await Sharp2(image).extract(
|
|
136
|
+
const img = await Sharp2(image).extract(finalCenterRect).toBuffer();
|
|
104
137
|
const trimInfo = await trimImage(img);
|
|
105
138
|
if (!trimInfo) {
|
|
106
|
-
return
|
|
139
|
+
return finalCenterRect;
|
|
107
140
|
}
|
|
108
141
|
return {
|
|
109
|
-
left:
|
|
110
|
-
top:
|
|
142
|
+
left: finalCenterRect.left - trimInfo.trimOffsetLeft,
|
|
143
|
+
top: finalCenterRect.top - trimInfo.trimOffsetTop,
|
|
111
144
|
width: trimInfo.width,
|
|
112
145
|
height: trimInfo.height
|
|
113
146
|
};
|
|
114
147
|
} catch (e) {
|
|
115
|
-
console.
|
|
148
|
+
console.warn(imgInfo, finalCenterRect);
|
|
116
149
|
throw e;
|
|
117
150
|
}
|
|
118
151
|
}
|
|
119
152
|
|
|
120
153
|
// src/image/visualization.ts
|
|
121
|
-
import { Buffer as
|
|
154
|
+
import { Buffer as Buffer4 } from "buffer";
|
|
122
155
|
|
|
123
156
|
// src/utils.ts
|
|
124
157
|
import assert2 from "assert";
|
|
@@ -260,7 +293,7 @@ async function composeSectionDiagram(sections, context) {
|
|
|
260
293
|
${rects.join("\n")}
|
|
261
294
|
</svg>
|
|
262
295
|
`;
|
|
263
|
-
const svgBuffer =
|
|
296
|
+
const svgBuffer = Buffer4.from(rectangles);
|
|
264
297
|
const file = getTmpFile("png");
|
|
265
298
|
await Sharp3({
|
|
266
299
|
create: {
|
package/dist/es/index.js
CHANGED
|
@@ -1228,7 +1228,7 @@ var Executor = class {
|
|
|
1228
1228
|
returnValue = await task.executor(param, executorContext);
|
|
1229
1229
|
}
|
|
1230
1230
|
Object.assign(task, returnValue);
|
|
1231
|
-
task.status = "
|
|
1231
|
+
task.status = "finished";
|
|
1232
1232
|
task.timing.end = Date.now();
|
|
1233
1233
|
task.timing.cost = task.timing.end - task.timing.start;
|
|
1234
1234
|
taskIndex++;
|
|
@@ -1247,12 +1247,13 @@ var Executor = class {
|
|
|
1247
1247
|
}
|
|
1248
1248
|
if (successfullyCompleted) {
|
|
1249
1249
|
this.status = "completed";
|
|
1250
|
-
if (this.tasks.length) {
|
|
1251
|
-
return this.tasks[this.tasks.length - 1].output;
|
|
1252
|
-
}
|
|
1253
1250
|
} else {
|
|
1254
1251
|
this.status = "error";
|
|
1255
1252
|
}
|
|
1253
|
+
if (this.tasks.length) {
|
|
1254
|
+
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
|
|
1255
|
+
return this.tasks[outputIndex].output;
|
|
1256
|
+
}
|
|
1256
1257
|
}
|
|
1257
1258
|
isInErrorState() {
|
|
1258
1259
|
return this.status === "error";
|
|
@@ -4547,7 +4548,8 @@ async function call(messages, responseFormat) {
|
|
|
4547
4548
|
const completion = await openai.chat.completions.create({
|
|
4548
4549
|
model,
|
|
4549
4550
|
messages,
|
|
4550
|
-
response_format: { type: responseFormat }
|
|
4551
|
+
response_format: { type: responseFormat },
|
|
4552
|
+
temperature: 0.2
|
|
4551
4553
|
});
|
|
4552
4554
|
const { content } = completion.choices[0].message;
|
|
4553
4555
|
assert3(content, "empty content");
|
|
@@ -4926,8 +4928,8 @@ async function callCozeAi(options) {
|
|
|
4926
4928
|
}
|
|
4927
4929
|
const aiResponse = await completion.json();
|
|
4928
4930
|
if (aiResponse.code !== 0) {
|
|
4929
|
-
console.error("CozeAI error response", aiResponse);
|
|
4930
|
-
throw new Error(
|
|
4931
|
+
console.error("CozeAI error response", aiResponse.msg);
|
|
4932
|
+
throw new Error(`CozeAI error response ${aiResponse.msg}`);
|
|
4931
4933
|
}
|
|
4932
4934
|
if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
|
|
4933
4935
|
console.error("aiResponse", aiResponse);
|
|
@@ -5180,7 +5182,7 @@ function systemPromptToTaskPlanning() {
|
|
|
5180
5182
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
5181
5183
|
|
|
5182
5184
|
Here is an example of how to decompose a task.
|
|
5183
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you
|
|
5185
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
5184
5186
|
* Find: 'The search bar'
|
|
5185
5187
|
* Input: 'Weather in Shanghai'
|
|
5186
5188
|
* Sleep: 1000
|
|
@@ -5190,7 +5192,7 @@ function systemPromptToTaskPlanning() {
|
|
|
5190
5192
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
5191
5193
|
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
5192
5194
|
|
|
5193
|
-
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal.
|
|
5195
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
5194
5196
|
|
|
5195
5197
|
Return in the following JSON format:
|
|
5196
5198
|
{
|
package/dist/lib/ai-model.js
CHANGED
|
@@ -4276,7 +4276,8 @@ async function call(messages, responseFormat) {
|
|
|
4276
4276
|
const completion = await openai.chat.completions.create({
|
|
4277
4277
|
model,
|
|
4278
4278
|
messages,
|
|
4279
|
-
response_format: { type: responseFormat }
|
|
4279
|
+
response_format: { type: responseFormat },
|
|
4280
|
+
temperature: 0.2
|
|
4280
4281
|
});
|
|
4281
4282
|
const { content } = completion.choices[0].message;
|
|
4282
4283
|
(0, import_node_assert.default)(content, "empty content");
|
|
@@ -4630,8 +4631,8 @@ async function callCozeAi(options) {
|
|
|
4630
4631
|
}
|
|
4631
4632
|
const aiResponse = await completion.json();
|
|
4632
4633
|
if (aiResponse.code !== 0) {
|
|
4633
|
-
console.error("CozeAI error response", aiResponse);
|
|
4634
|
-
throw new Error(
|
|
4634
|
+
console.error("CozeAI error response", aiResponse.msg);
|
|
4635
|
+
throw new Error(`CozeAI error response ${aiResponse.msg}`);
|
|
4635
4636
|
}
|
|
4636
4637
|
if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
|
|
4637
4638
|
console.error("aiResponse", aiResponse);
|
|
@@ -4884,7 +4885,7 @@ function systemPromptToTaskPlanning() {
|
|
|
4884
4885
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4885
4886
|
|
|
4886
4887
|
Here is an example of how to decompose a task.
|
|
4887
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you
|
|
4888
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4888
4889
|
* Find: 'The search bar'
|
|
4889
4890
|
* Input: 'Weather in Shanghai'
|
|
4890
4891
|
* Sleep: 1000
|
|
@@ -4894,7 +4895,7 @@ function systemPromptToTaskPlanning() {
|
|
|
4894
4895
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4895
4896
|
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4896
4897
|
|
|
4897
|
-
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal.
|
|
4898
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4898
4899
|
|
|
4899
4900
|
Return in the following JSON format:
|
|
4900
4901
|
{
|
package/dist/lib/image.js
CHANGED
|
@@ -3,8 +3,22 @@ var __create = Object.create;
|
|
|
3
3
|
var __defProp = Object.defineProperty;
|
|
4
4
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
|
|
6
7
|
var __getProtoOf = Object.getPrototypeOf;
|
|
7
8
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
9
|
+
var __propIsEnum = Object.prototype.propertyIsEnumerable;
|
|
10
|
+
var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
|
|
11
|
+
var __spreadValues = (a, b) => {
|
|
12
|
+
for (var prop in b || (b = {}))
|
|
13
|
+
if (__hasOwnProp.call(b, prop))
|
|
14
|
+
__defNormalProp(a, prop, b[prop]);
|
|
15
|
+
if (__getOwnPropSymbols)
|
|
16
|
+
for (var prop of __getOwnPropSymbols(b)) {
|
|
17
|
+
if (__propIsEnum.call(b, prop))
|
|
18
|
+
__defNormalProp(a, prop, b[prop]);
|
|
19
|
+
}
|
|
20
|
+
return a;
|
|
21
|
+
};
|
|
8
22
|
var __export = (target, all) => {
|
|
9
23
|
for (var name in all)
|
|
10
24
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -144,20 +158,36 @@ async function alignCoordByTrim(image, centerRect) {
|
|
|
144
158
|
if (!(imgInfo == null ? void 0 : imgInfo.width) || !imgInfo.height || imgInfo.width <= 3 || imgInfo.height <= 3) {
|
|
145
159
|
return centerRect;
|
|
146
160
|
}
|
|
161
|
+
const zeroSize = {
|
|
162
|
+
left: 0,
|
|
163
|
+
top: 0,
|
|
164
|
+
width: -1,
|
|
165
|
+
height: -1
|
|
166
|
+
};
|
|
167
|
+
const finalCenterRect = __spreadValues({}, centerRect);
|
|
168
|
+
if (centerRect.left > imgInfo.width || centerRect.top > imgInfo.height) {
|
|
169
|
+
return zeroSize;
|
|
170
|
+
}
|
|
171
|
+
if (centerRect.left + centerRect.width > imgInfo.width) {
|
|
172
|
+
finalCenterRect.width = imgInfo.width - centerRect.left;
|
|
173
|
+
}
|
|
174
|
+
if (centerRect.top + centerRect.height > imgInfo.height) {
|
|
175
|
+
finalCenterRect.height = imgInfo.height - centerRect.top;
|
|
176
|
+
}
|
|
147
177
|
try {
|
|
148
|
-
const img = await (0, import_sharp2.default)(image).extract(
|
|
178
|
+
const img = await (0, import_sharp2.default)(image).extract(finalCenterRect).toBuffer();
|
|
149
179
|
const trimInfo = await trimImage(img);
|
|
150
180
|
if (!trimInfo) {
|
|
151
|
-
return
|
|
181
|
+
return finalCenterRect;
|
|
152
182
|
}
|
|
153
183
|
return {
|
|
154
|
-
left:
|
|
155
|
-
top:
|
|
184
|
+
left: finalCenterRect.left - trimInfo.trimOffsetLeft,
|
|
185
|
+
top: finalCenterRect.top - trimInfo.trimOffsetTop,
|
|
156
186
|
width: trimInfo.width,
|
|
157
187
|
height: trimInfo.height
|
|
158
188
|
};
|
|
159
189
|
} catch (e) {
|
|
160
|
-
console.
|
|
190
|
+
console.warn(imgInfo, finalCenterRect);
|
|
161
191
|
throw e;
|
|
162
192
|
}
|
|
163
193
|
}
|
package/dist/lib/index.js
CHANGED
|
@@ -1244,7 +1244,7 @@ var Executor = class {
|
|
|
1244
1244
|
returnValue = await task.executor(param, executorContext);
|
|
1245
1245
|
}
|
|
1246
1246
|
Object.assign(task, returnValue);
|
|
1247
|
-
task.status = "
|
|
1247
|
+
task.status = "finished";
|
|
1248
1248
|
task.timing.end = Date.now();
|
|
1249
1249
|
task.timing.cost = task.timing.end - task.timing.start;
|
|
1250
1250
|
taskIndex++;
|
|
@@ -1263,12 +1263,13 @@ var Executor = class {
|
|
|
1263
1263
|
}
|
|
1264
1264
|
if (successfullyCompleted) {
|
|
1265
1265
|
this.status = "completed";
|
|
1266
|
-
if (this.tasks.length) {
|
|
1267
|
-
return this.tasks[this.tasks.length - 1].output;
|
|
1268
|
-
}
|
|
1269
1266
|
} else {
|
|
1270
1267
|
this.status = "error";
|
|
1271
1268
|
}
|
|
1269
|
+
if (this.tasks.length) {
|
|
1270
|
+
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
|
|
1271
|
+
return this.tasks[outputIndex].output;
|
|
1272
|
+
}
|
|
1272
1273
|
}
|
|
1273
1274
|
isInErrorState() {
|
|
1274
1275
|
return this.status === "error";
|
|
@@ -4563,7 +4564,8 @@ async function call(messages, responseFormat) {
|
|
|
4563
4564
|
const completion = await openai.chat.completions.create({
|
|
4564
4565
|
model,
|
|
4565
4566
|
messages,
|
|
4566
|
-
response_format: { type: responseFormat }
|
|
4567
|
+
response_format: { type: responseFormat },
|
|
4568
|
+
temperature: 0.2
|
|
4567
4569
|
});
|
|
4568
4570
|
const { content } = completion.choices[0].message;
|
|
4569
4571
|
(0, import_node_assert3.default)(content, "empty content");
|
|
@@ -4942,8 +4944,8 @@ async function callCozeAi(options) {
|
|
|
4942
4944
|
}
|
|
4943
4945
|
const aiResponse = await completion.json();
|
|
4944
4946
|
if (aiResponse.code !== 0) {
|
|
4945
|
-
console.error("CozeAI error response", aiResponse);
|
|
4946
|
-
throw new Error(
|
|
4947
|
+
console.error("CozeAI error response", aiResponse.msg);
|
|
4948
|
+
throw new Error(`CozeAI error response ${aiResponse.msg}`);
|
|
4947
4949
|
}
|
|
4948
4950
|
if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
|
|
4949
4951
|
console.error("aiResponse", aiResponse);
|
|
@@ -5196,7 +5198,7 @@ function systemPromptToTaskPlanning() {
|
|
|
5196
5198
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
5197
5199
|
|
|
5198
5200
|
Here is an example of how to decompose a task.
|
|
5199
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you
|
|
5201
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
5200
5202
|
* Find: 'The search bar'
|
|
5201
5203
|
* Input: 'Weather in Shanghai'
|
|
5202
5204
|
* Sleep: 1000
|
|
@@ -5206,7 +5208,7 @@ function systemPromptToTaskPlanning() {
|
|
|
5206
5208
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
5207
5209
|
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
5208
5210
|
|
|
5209
|
-
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal.
|
|
5211
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
5210
5212
|
|
|
5211
5213
|
Return in the following JSON format:
|
|
5212
5214
|
{
|
package/dist/types/ai-model.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { ChatCompletionMessageParam } from 'openai/resources';
|
|
2
2
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
|
-
import { c as callAiFn } from './index-
|
|
4
|
-
export { d as describeUserPage, p as plan } from './index-
|
|
5
|
-
import { B as BaseElement, U as UIContext, e as AIElementParseResponse, f as AISectionParseResponse, g as AIAssertionResponse } from './types-
|
|
3
|
+
import { c as callAiFn } from './index-0479d487.js';
|
|
4
|
+
export { d as describeUserPage, p as plan } from './index-0479d487.js';
|
|
5
|
+
import { B as BaseElement, U as UIContext, e as AIElementParseResponse, f as AISectionParseResponse, g as AIAssertionResponse } from './types-3eb61b5c.js';
|
|
6
6
|
|
|
7
7
|
declare function AiInspectElement<ElementType extends BaseElement = BaseElement>(options: {
|
|
8
8
|
context: UIContext<ElementType>;
|
package/dist/types/image.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Buffer } from 'node:buffer';
|
|
2
|
-
import { S as Size, R as Rect, h as UISection, U as UIContext,
|
|
2
|
+
import { S as Size, R as Rect, h as UISection, U as UIContext, G as Color } from './types-3eb61b5c.js';
|
|
3
3
|
import 'openai/resources';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as BaseElement, U as UIContext,
|
|
1
|
+
import { B as BaseElement, U as UIContext, r as PlanningAction } from './types-3eb61b5c.js';
|
|
2
2
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
package/dist/types/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightTaskInfo, c as InsightOptions, d as InsightAssertionResponse } from './types-
|
|
2
|
-
export { g as AIAssertionResponse, e as AIElementParseResponse, A as AIResponseFormat, f as AISectionParseResponse,
|
|
3
|
-
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-
|
|
4
|
-
export { p as plan } from './index-
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightTaskInfo, c as InsightOptions, d as InsightAssertionResponse } from './types-3eb61b5c.js';
|
|
2
|
+
export { g as AIAssertionResponse, e as AIElementParseResponse, A as AIResponseFormat, f as AISectionParseResponse, q as AgentWaitForOpt, H as BaseAgentParserOpt, j as BasicSectionQuery, C as CallAIFn, G as Color, l as DumpMeta, p as ElementById, i as EnsureObject, M as ExecutionRecorderItem, a6 as ExecutionTaskAction, a5 as ExecutionTaskActionApply, a4 as ExecutionTaskInsightAssertion, a3 as ExecutionTaskInsightAssertionApply, a2 as ExecutionTaskInsightAssertionParam, X as ExecutionTaskInsightDumpLog, Z as ExecutionTaskInsightLocate, Y as ExecutionTaskInsightLocateApply, W as ExecutionTaskInsightLocateOutput, V as ExecutionTaskInsightLocateParam, a1 as ExecutionTaskInsightQuery, a0 as ExecutionTaskInsightQueryApply, $ as ExecutionTaskInsightQueryOutput, _ as ExecutionTaskInsightQueryParam, a8 as ExecutionTaskPlanning, a7 as ExecutionTaskPlanningApply, Q as ExecutionTaskReturn, N as ExecutionTaskType, O as ExecutorContext, a9 as GroupedActionDump, n as InsightDump, k as InsightExtractParam, L as LiteUISection, o as PartialInsightDumpFromSDK, s as PlanningAIResponse, r as PlanningAction, x as PlanningActionParamAssert, z as PlanningActionParamError, u as PlanningActionParamHover, v as PlanningActionParamInputOrKeyPress, w as PlanningActionParamScroll, y as PlanningActionParamSleep, t as PlanningActionParamTap, F as PlanningActionParamWaitFor, K as PlaywrightParserOpt, P as Point, J as PuppeteerParserOpt, R as Rect, m as ReportDumpWithAttributes, S as Size, T as TaskCacheInfo, h as UISection } from './types-3eb61b5c.js';
|
|
3
|
+
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-0479d487.js';
|
|
4
|
+
export { p as plan } from './index-0479d487.js';
|
|
5
5
|
export { setLogDir } from './utils.js';
|
|
6
6
|
import 'openai/resources';
|
|
7
7
|
|
|
@@ -10,7 +10,7 @@ interface Size {
|
|
|
10
10
|
}
|
|
11
11
|
type Rect = Point & Size;
|
|
12
12
|
declare enum NodeType {
|
|
13
|
-
|
|
13
|
+
FORM_ITEM = "FORM_ITEM Node",
|
|
14
14
|
BUTTON = "BUTTON Node",
|
|
15
15
|
IMG = "IMG Node",
|
|
16
16
|
TEXT = "TEXT Node"
|
|
@@ -122,13 +122,20 @@ interface LiteUISection {
|
|
|
122
122
|
}
|
|
123
123
|
type ElementById = (id: string) => BaseElement | null;
|
|
124
124
|
type InsightAssertionResponse = AIAssertionResponse;
|
|
125
|
+
/**
|
|
126
|
+
* agent
|
|
127
|
+
*/
|
|
128
|
+
interface AgentWaitForOpt {
|
|
129
|
+
checkIntervalMs?: number;
|
|
130
|
+
timeoutMs?: number;
|
|
131
|
+
}
|
|
125
132
|
/**
|
|
126
133
|
* planning
|
|
127
134
|
*
|
|
128
135
|
*/
|
|
129
136
|
interface PlanningAction<ParamType = any> {
|
|
130
137
|
thought?: string;
|
|
131
|
-
type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'Sleep';
|
|
138
|
+
type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
|
|
132
139
|
param: ParamType;
|
|
133
140
|
}
|
|
134
141
|
interface PlanningAIResponse {
|
|
@@ -150,6 +157,12 @@ interface PlanningActionParamAssert {
|
|
|
150
157
|
interface PlanningActionParamSleep {
|
|
151
158
|
timeMs: number;
|
|
152
159
|
}
|
|
160
|
+
interface PlanningActionParamError {
|
|
161
|
+
thought: string;
|
|
162
|
+
}
|
|
163
|
+
type PlanningActionParamWaitFor = AgentWaitForOpt & {
|
|
164
|
+
assertion: string;
|
|
165
|
+
};
|
|
153
166
|
/**
|
|
154
167
|
* misc
|
|
155
168
|
*/
|
|
@@ -191,7 +204,7 @@ interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {
|
|
|
191
204
|
cache?: TaskCacheInfo;
|
|
192
205
|
}
|
|
193
206
|
type ExecutionTask<E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<any, any, any>> = E & ExecutionTaskReturn<E extends ExecutionTaskApply<any, any, infer TaskOutput, any> ? TaskOutput : unknown, E extends ExecutionTaskApply<any, any, any, infer TaskLog> ? TaskLog : unknown> & {
|
|
194
|
-
status: 'pending' | 'running' | '
|
|
207
|
+
status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';
|
|
195
208
|
error?: string;
|
|
196
209
|
errorStack?: string;
|
|
197
210
|
timing?: {
|
|
@@ -243,4 +256,4 @@ interface GroupedActionDump {
|
|
|
243
256
|
executions: ExecutionDump[];
|
|
244
257
|
}
|
|
245
258
|
|
|
246
|
-
export { type
|
|
259
|
+
export { type ExecutionTaskInsightQueryOutput as $, AIResponseFormat as A, BaseElement as B, type CallAIFn as C, type DumpSubscriber as D, type ExecutionTask as E, type PlanningActionParamWaitFor as F, type Color as G, type BaseAgentParserOpt as H, type InsightTaskInfo as I, type PuppeteerParserOpt as J, type PlaywrightParserOpt as K, type LiteUISection as L, type ExecutionRecorderItem as M, type ExecutionTaskType as N, type ExecutorContext as O, type Point as P, type ExecutionTaskReturn as Q, type Rect as R, type Size as S, type TaskCacheInfo as T, UIContext as U, type ExecutionTaskInsightLocateParam as V, type ExecutionTaskInsightLocateOutput as W, type ExecutionTaskInsightDumpLog as X, type ExecutionTaskInsightLocateApply as Y, type ExecutionTaskInsightLocate as Z, type ExecutionTaskInsightQueryParam as _, type ExecutionTaskApply as a, type ExecutionTaskInsightQueryApply as a0, type ExecutionTaskInsightQuery as a1, type ExecutionTaskInsightAssertionParam as a2, type ExecutionTaskInsightAssertionApply as a3, type ExecutionTaskInsightAssertion as a4, type ExecutionTaskActionApply as a5, type ExecutionTaskAction as a6, type ExecutionTaskPlanningApply as a7, type ExecutionTaskPlanning as a8, type GroupedActionDump as a9, type ExecutionDump as b, type InsightOptions as c, type InsightAssertionResponse as d, type AIElementParseResponse as e, type AISectionParseResponse as f, type AIAssertionResponse as g, type UISection as h, type EnsureObject as i, type BasicSectionQuery as j, type InsightExtractParam as k, type DumpMeta as l, type ReportDumpWithAttributes as m, type InsightDump as n, type PartialInsightDumpFromSDK as o, type ElementById as p, type AgentWaitForOpt as q, type PlanningAction as r, type PlanningAIResponse as s, type PlanningActionParamTap as t, type PlanningActionParamHover as u, type PlanningActionParamInputOrKeyPress as v, type PlanningActionParamScroll as w, type PlanningActionParamAssert as x, type PlanningActionParamSleep as y, type PlanningActionParamError as z };
|
package/dist/types/utils.d.ts
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@midscene/core",
|
|
3
3
|
"description": "Hello, It's Midscene",
|
|
4
|
-
"version": "0.3.0",
|
|
4
|
+
"version": "0.3.1-beta-20240821105917.0",
|
|
5
5
|
"jsnext:source": "./src/index.ts",
|
|
6
6
|
"main": "./dist/lib/index.js",
|
|
7
7
|
"module": "./dist/es/index.js",
|
|
@@ -60,18 +60,19 @@
|
|
|
60
60
|
}
|
|
61
61
|
},
|
|
62
62
|
"dependencies": {
|
|
63
|
+
"node-fetch": "2.6.7",
|
|
63
64
|
"openai": "4.47.1",
|
|
64
|
-
"
|
|
65
|
-
"
|
|
65
|
+
"optional": "0.1.4",
|
|
66
|
+
"sharp": "0.33.3"
|
|
66
67
|
},
|
|
67
68
|
"devDependencies": {
|
|
68
|
-
"@types/node-fetch": "2.6.11",
|
|
69
69
|
"@modern-js/module-tools": "^2.56.1",
|
|
70
70
|
"@types/node": "^18.0.0",
|
|
71
|
+
"@types/node-fetch": "2.6.11",
|
|
72
|
+
"dotenv": "16.4.5",
|
|
71
73
|
"langsmith": "0.1.36",
|
|
72
74
|
"typescript": "~5.0.4",
|
|
73
|
-
"vitest": "^1.6.0"
|
|
74
|
-
"dotenv": "16.4.5"
|
|
75
|
+
"vitest": "^1.6.0"
|
|
75
76
|
},
|
|
76
77
|
"engines": {
|
|
77
78
|
"node": ">=16.0.0"
|
|
@@ -88,6 +89,6 @@
|
|
|
88
89
|
"new": "modern new",
|
|
89
90
|
"upgrade": "modern upgrade",
|
|
90
91
|
"test": "vitest --run",
|
|
91
|
-
"test:
|
|
92
|
+
"test:ai": "AITEST=true npm run test"
|
|
92
93
|
}
|
|
93
94
|
}
|