@midscene/core 0.3.0 → 0.3.1-beta-20240821105917.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2024-present Midscene.js
3
+ Copyright (c) 2024-present Bytedance, Inc. and its affiliates.
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
21
+ SOFTWARE.
@@ -4256,7 +4256,8 @@ async function call(messages, responseFormat) {
4256
4256
  const completion = await openai.chat.completions.create({
4257
4257
  model,
4258
4258
  messages,
4259
- response_format: { type: responseFormat }
4259
+ response_format: { type: responseFormat },
4260
+ temperature: 0.2
4260
4261
  });
4261
4262
  const { content } = completion.choices[0].message;
4262
4263
  assert(content, "empty content");
@@ -4615,8 +4616,8 @@ async function callCozeAi(options) {
4615
4616
  }
4616
4617
  const aiResponse = await completion.json();
4617
4618
  if (aiResponse.code !== 0) {
4618
- console.error("CozeAI error response", aiResponse);
4619
- throw new Error("CozeAI error response", aiResponse);
4619
+ console.error("CozeAI error response", aiResponse.msg);
4620
+ throw new Error(`CozeAI error response ${aiResponse.msg}`);
4620
4621
  }
4621
4622
  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4622
4623
  console.error("aiResponse", aiResponse);
@@ -4869,7 +4870,7 @@ function systemPromptToTaskPlanning() {
4869
4870
  * param: { timeMs: number }, wait for timeMs milliseconds
4870
4871
 
4871
4872
  Here is an example of how to decompose a task.
4872
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you my decompose this task into something like this:
4873
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4873
4874
  * Find: 'The search bar'
4874
4875
  * Input: 'Weather in Shanghai'
4875
4876
  * Sleep: 1000
@@ -4879,7 +4880,7 @@ function systemPromptToTaskPlanning() {
4879
4880
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4880
4881
  2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4881
4882
 
4882
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
4883
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4883
4884
 
4884
4885
  Return in the following JSON format:
4885
4886
  {
package/dist/es/image.js CHANGED
@@ -1,6 +1,23 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropSymbols = Object.getOwnPropertySymbols;
3
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
4
+ var __propIsEnum = Object.prototype.propertyIsEnumerable;
5
+ var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
6
+ var __spreadValues = (a, b) => {
7
+ for (var prop in b || (b = {}))
8
+ if (__hasOwnProp.call(b, prop))
9
+ __defNormalProp(a, prop, b[prop]);
10
+ if (__getOwnPropSymbols)
11
+ for (var prop of __getOwnPropSymbols(b)) {
12
+ if (__propIsEnum.call(b, prop))
13
+ __defNormalProp(a, prop, b[prop]);
14
+ }
15
+ return a;
16
+ };
17
+
1
18
  // src/image/info.ts
2
19
  import assert from "assert";
3
- import { Buffer } from "buffer";
20
+ import { Buffer as Buffer2 } from "buffer";
4
21
  import { readFileSync } from "fs";
5
22
  import Sharp from "sharp";
6
23
  async function imageInfo(image) {
@@ -10,7 +27,7 @@ async function imageInfo(image) {
10
27
  }
11
28
  async function imageInfoOfBase64(imageBase64) {
12
29
  const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
13
- return imageInfo(Buffer.from(base64Data, "base64"));
30
+ return imageInfo(Buffer2.from(base64Data, "base64"));
14
31
  }
15
32
  function base64Encoded(image, withHeader = true) {
16
33
  const imageBuffer = readFileSync(image);
@@ -27,12 +44,12 @@ function base64Encoded(image, withHeader = true) {
27
44
  }
28
45
 
29
46
  // src/image/transform.ts
30
- import { Buffer as Buffer2 } from "buffer";
47
+ import { Buffer as Buffer3 } from "buffer";
31
48
  import Sharp2 from "sharp";
32
49
  async function saveBase64Image(options) {
33
50
  const { base64Data, outputPath } = options;
34
51
  const base64Image = base64Data.split(";base64,").pop() || base64Data;
35
- const imageBuffer = Buffer2.from(base64Image, "base64");
52
+ const imageBuffer = Buffer3.from(base64Image, "base64");
36
53
  await Sharp2(imageBuffer).toFile(outputPath);
37
54
  console.log("Image successfully written to file.");
38
55
  }
@@ -44,7 +61,7 @@ async function transformImgPathToBase64(inputPath) {
44
61
  }
45
62
  async function resizeImg(base64Data) {
46
63
  const base64Image = base64Data.split(";base64,").pop() || base64Data;
47
- const imageBuffer = Buffer2.from(base64Image, "base64");
64
+ const imageBuffer = Buffer3.from(base64Image, "base64");
48
65
  const metadata = await Sharp2(imageBuffer).metadata();
49
66
  const { width, height } = metadata;
50
67
  if (!width || !height) {
@@ -99,26 +116,42 @@ async function alignCoordByTrim(image, centerRect) {
99
116
  if (!(imgInfo == null ? void 0 : imgInfo.width) || !imgInfo.height || imgInfo.width <= 3 || imgInfo.height <= 3) {
100
117
  return centerRect;
101
118
  }
119
+ const zeroSize = {
120
+ left: 0,
121
+ top: 0,
122
+ width: -1,
123
+ height: -1
124
+ };
125
+ const finalCenterRect = __spreadValues({}, centerRect);
126
+ if (centerRect.left > imgInfo.width || centerRect.top > imgInfo.height) {
127
+ return zeroSize;
128
+ }
129
+ if (centerRect.left + centerRect.width > imgInfo.width) {
130
+ finalCenterRect.width = imgInfo.width - centerRect.left;
131
+ }
132
+ if (centerRect.top + centerRect.height > imgInfo.height) {
133
+ finalCenterRect.height = imgInfo.height - centerRect.top;
134
+ }
102
135
  try {
103
- const img = await Sharp2(image).extract(centerRect).toBuffer();
136
+ const img = await Sharp2(image).extract(finalCenterRect).toBuffer();
104
137
  const trimInfo = await trimImage(img);
105
138
  if (!trimInfo) {
106
- return centerRect;
139
+ return finalCenterRect;
107
140
  }
108
141
  return {
109
- left: centerRect.left - trimInfo.trimOffsetLeft,
110
- top: centerRect.top - trimInfo.trimOffsetTop,
142
+ left: finalCenterRect.left - trimInfo.trimOffsetLeft,
143
+ top: finalCenterRect.top - trimInfo.trimOffsetTop,
111
144
  width: trimInfo.width,
112
145
  height: trimInfo.height
113
146
  };
114
147
  } catch (e) {
115
- console.log(imgInfo);
148
+ console.warn(imgInfo, finalCenterRect);
116
149
  throw e;
117
150
  }
118
151
  }
119
152
 
120
153
  // src/image/visualization.ts
121
- import { Buffer as Buffer3 } from "buffer";
154
+ import { Buffer as Buffer4 } from "buffer";
122
155
 
123
156
  // src/utils.ts
124
157
  import assert2 from "assert";
@@ -260,7 +293,7 @@ async function composeSectionDiagram(sections, context) {
260
293
  ${rects.join("\n")}
261
294
  </svg>
262
295
  `;
263
- const svgBuffer = Buffer3.from(rectangles);
296
+ const svgBuffer = Buffer4.from(rectangles);
264
297
  const file = getTmpFile("png");
265
298
  await Sharp3({
266
299
  create: {
package/dist/es/index.js CHANGED
@@ -1228,7 +1228,7 @@ var Executor = class {
1228
1228
  returnValue = await task.executor(param, executorContext);
1229
1229
  }
1230
1230
  Object.assign(task, returnValue);
1231
- task.status = "success";
1231
+ task.status = "finished";
1232
1232
  task.timing.end = Date.now();
1233
1233
  task.timing.cost = task.timing.end - task.timing.start;
1234
1234
  taskIndex++;
@@ -1247,12 +1247,13 @@ var Executor = class {
1247
1247
  }
1248
1248
  if (successfullyCompleted) {
1249
1249
  this.status = "completed";
1250
- if (this.tasks.length) {
1251
- return this.tasks[this.tasks.length - 1].output;
1252
- }
1253
1250
  } else {
1254
1251
  this.status = "error";
1255
1252
  }
1253
+ if (this.tasks.length) {
1254
+ const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
1255
+ return this.tasks[outputIndex].output;
1256
+ }
1256
1257
  }
1257
1258
  isInErrorState() {
1258
1259
  return this.status === "error";
@@ -4547,7 +4548,8 @@ async function call(messages, responseFormat) {
4547
4548
  const completion = await openai.chat.completions.create({
4548
4549
  model,
4549
4550
  messages,
4550
- response_format: { type: responseFormat }
4551
+ response_format: { type: responseFormat },
4552
+ temperature: 0.2
4551
4553
  });
4552
4554
  const { content } = completion.choices[0].message;
4553
4555
  assert3(content, "empty content");
@@ -4926,8 +4928,8 @@ async function callCozeAi(options) {
4926
4928
  }
4927
4929
  const aiResponse = await completion.json();
4928
4930
  if (aiResponse.code !== 0) {
4929
- console.error("CozeAI error response", aiResponse);
4930
- throw new Error("CozeAI error response", aiResponse);
4931
+ console.error("CozeAI error response", aiResponse.msg);
4932
+ throw new Error(`CozeAI error response ${aiResponse.msg}`);
4931
4933
  }
4932
4934
  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4933
4935
  console.error("aiResponse", aiResponse);
@@ -5180,7 +5182,7 @@ function systemPromptToTaskPlanning() {
5180
5182
  * param: { timeMs: number }, wait for timeMs milliseconds
5181
5183
 
5182
5184
  Here is an example of how to decompose a task.
5183
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you my decompose this task into something like this:
5185
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
5184
5186
  * Find: 'The search bar'
5185
5187
  * Input: 'Weather in Shanghai'
5186
5188
  * Sleep: 1000
@@ -5190,7 +5192,7 @@ function systemPromptToTaskPlanning() {
5190
5192
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
5191
5193
  2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
5192
5194
 
5193
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
5195
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
5194
5196
 
5195
5197
  Return in the following JSON format:
5196
5198
  {
@@ -4276,7 +4276,8 @@ async function call(messages, responseFormat) {
4276
4276
  const completion = await openai.chat.completions.create({
4277
4277
  model,
4278
4278
  messages,
4279
- response_format: { type: responseFormat }
4279
+ response_format: { type: responseFormat },
4280
+ temperature: 0.2
4280
4281
  });
4281
4282
  const { content } = completion.choices[0].message;
4282
4283
  (0, import_node_assert.default)(content, "empty content");
@@ -4630,8 +4631,8 @@ async function callCozeAi(options) {
4630
4631
  }
4631
4632
  const aiResponse = await completion.json();
4632
4633
  if (aiResponse.code !== 0) {
4633
- console.error("CozeAI error response", aiResponse);
4634
- throw new Error("CozeAI error response", aiResponse);
4634
+ console.error("CozeAI error response", aiResponse.msg);
4635
+ throw new Error(`CozeAI error response ${aiResponse.msg}`);
4635
4636
  }
4636
4637
  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4637
4638
  console.error("aiResponse", aiResponse);
@@ -4884,7 +4885,7 @@ function systemPromptToTaskPlanning() {
4884
4885
  * param: { timeMs: number }, wait for timeMs milliseconds
4885
4886
 
4886
4887
  Here is an example of how to decompose a task.
4887
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you my decompose this task into something like this:
4888
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4888
4889
  * Find: 'The search bar'
4889
4890
  * Input: 'Weather in Shanghai'
4890
4891
  * Sleep: 1000
@@ -4894,7 +4895,7 @@ function systemPromptToTaskPlanning() {
4894
4895
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4895
4896
  2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4896
4897
 
4897
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
4898
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4898
4899
 
4899
4900
  Return in the following JSON format:
4900
4901
  {
package/dist/lib/image.js CHANGED
@@ -3,8 +3,22 @@ var __create = Object.create;
3
3
  var __defProp = Object.defineProperty;
4
4
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getOwnPropSymbols = Object.getOwnPropertySymbols;
6
7
  var __getProtoOf = Object.getPrototypeOf;
7
8
  var __hasOwnProp = Object.prototype.hasOwnProperty;
9
+ var __propIsEnum = Object.prototype.propertyIsEnumerable;
10
+ var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
11
+ var __spreadValues = (a, b) => {
12
+ for (var prop in b || (b = {}))
13
+ if (__hasOwnProp.call(b, prop))
14
+ __defNormalProp(a, prop, b[prop]);
15
+ if (__getOwnPropSymbols)
16
+ for (var prop of __getOwnPropSymbols(b)) {
17
+ if (__propIsEnum.call(b, prop))
18
+ __defNormalProp(a, prop, b[prop]);
19
+ }
20
+ return a;
21
+ };
8
22
  var __export = (target, all) => {
9
23
  for (var name in all)
10
24
  __defProp(target, name, { get: all[name], enumerable: true });
@@ -144,20 +158,36 @@ async function alignCoordByTrim(image, centerRect) {
144
158
  if (!(imgInfo == null ? void 0 : imgInfo.width) || !imgInfo.height || imgInfo.width <= 3 || imgInfo.height <= 3) {
145
159
  return centerRect;
146
160
  }
161
+ const zeroSize = {
162
+ left: 0,
163
+ top: 0,
164
+ width: -1,
165
+ height: -1
166
+ };
167
+ const finalCenterRect = __spreadValues({}, centerRect);
168
+ if (centerRect.left > imgInfo.width || centerRect.top > imgInfo.height) {
169
+ return zeroSize;
170
+ }
171
+ if (centerRect.left + centerRect.width > imgInfo.width) {
172
+ finalCenterRect.width = imgInfo.width - centerRect.left;
173
+ }
174
+ if (centerRect.top + centerRect.height > imgInfo.height) {
175
+ finalCenterRect.height = imgInfo.height - centerRect.top;
176
+ }
147
177
  try {
148
- const img = await (0, import_sharp2.default)(image).extract(centerRect).toBuffer();
178
+ const img = await (0, import_sharp2.default)(image).extract(finalCenterRect).toBuffer();
149
179
  const trimInfo = await trimImage(img);
150
180
  if (!trimInfo) {
151
- return centerRect;
181
+ return finalCenterRect;
152
182
  }
153
183
  return {
154
- left: centerRect.left - trimInfo.trimOffsetLeft,
155
- top: centerRect.top - trimInfo.trimOffsetTop,
184
+ left: finalCenterRect.left - trimInfo.trimOffsetLeft,
185
+ top: finalCenterRect.top - trimInfo.trimOffsetTop,
156
186
  width: trimInfo.width,
157
187
  height: trimInfo.height
158
188
  };
159
189
  } catch (e) {
160
- console.log(imgInfo);
190
+ console.warn(imgInfo, finalCenterRect);
161
191
  throw e;
162
192
  }
163
193
  }
package/dist/lib/index.js CHANGED
@@ -1244,7 +1244,7 @@ var Executor = class {
1244
1244
  returnValue = await task.executor(param, executorContext);
1245
1245
  }
1246
1246
  Object.assign(task, returnValue);
1247
- task.status = "success";
1247
+ task.status = "finished";
1248
1248
  task.timing.end = Date.now();
1249
1249
  task.timing.cost = task.timing.end - task.timing.start;
1250
1250
  taskIndex++;
@@ -1263,12 +1263,13 @@ var Executor = class {
1263
1263
  }
1264
1264
  if (successfullyCompleted) {
1265
1265
  this.status = "completed";
1266
- if (this.tasks.length) {
1267
- return this.tasks[this.tasks.length - 1].output;
1268
- }
1269
1266
  } else {
1270
1267
  this.status = "error";
1271
1268
  }
1269
+ if (this.tasks.length) {
1270
+ const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
1271
+ return this.tasks[outputIndex].output;
1272
+ }
1272
1273
  }
1273
1274
  isInErrorState() {
1274
1275
  return this.status === "error";
@@ -4563,7 +4564,8 @@ async function call(messages, responseFormat) {
4563
4564
  const completion = await openai.chat.completions.create({
4564
4565
  model,
4565
4566
  messages,
4566
- response_format: { type: responseFormat }
4567
+ response_format: { type: responseFormat },
4568
+ temperature: 0.2
4567
4569
  });
4568
4570
  const { content } = completion.choices[0].message;
4569
4571
  (0, import_node_assert3.default)(content, "empty content");
@@ -4942,8 +4944,8 @@ async function callCozeAi(options) {
4942
4944
  }
4943
4945
  const aiResponse = await completion.json();
4944
4946
  if (aiResponse.code !== 0) {
4945
- console.error("CozeAI error response", aiResponse);
4946
- throw new Error("CozeAI error response", aiResponse);
4947
+ console.error("CozeAI error response", aiResponse.msg);
4948
+ throw new Error(`CozeAI error response ${aiResponse.msg}`);
4947
4949
  }
4948
4950
  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4949
4951
  console.error("aiResponse", aiResponse);
@@ -5196,7 +5198,7 @@ function systemPromptToTaskPlanning() {
5196
5198
  * param: { timeMs: number }, wait for timeMs milliseconds
5197
5199
 
5198
5200
  Here is an example of how to decompose a task.
5199
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you my decompose this task into something like this:
5201
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
5200
5202
  * Find: 'The search bar'
5201
5203
  * Input: 'Weather in Shanghai'
5202
5204
  * Sleep: 1000
@@ -5206,7 +5208,7 @@ function systemPromptToTaskPlanning() {
5206
5208
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
5207
5209
  2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
5208
5210
 
5209
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. If any errors occur during task planning (such as the page content being irrelevant to the task or the mentioned element not existing), please return the error message with an explanation in the errors field. Thoughts, prompts, and error messages should all be in the same language as the user query.
5211
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
5210
5212
 
5211
5213
  Return in the following JSON format:
5212
5214
  {
@@ -1,8 +1,8 @@
1
1
  import { ChatCompletionMessageParam } from 'openai/resources';
2
2
  export { ChatCompletionMessageParam } from 'openai/resources';
3
- import { c as callAiFn } from './index-f43935c0.js';
4
- export { d as describeUserPage, p as plan } from './index-f43935c0.js';
5
- import { B as BaseElement, U as UIContext, e as AIElementParseResponse, f as AISectionParseResponse, g as AIAssertionResponse } from './types-81f7991c.js';
3
+ import { c as callAiFn } from './index-0479d487.js';
4
+ export { d as describeUserPage, p as plan } from './index-0479d487.js';
5
+ import { B as BaseElement, U as UIContext, e as AIElementParseResponse, f as AISectionParseResponse, g as AIAssertionResponse } from './types-3eb61b5c.js';
6
6
 
7
7
  declare function AiInspectElement<ElementType extends BaseElement = BaseElement>(options: {
8
8
  context: UIContext<ElementType>;
@@ -1,5 +1,5 @@
1
1
  import { Buffer } from 'node:buffer';
2
- import { S as Size, R as Rect, h as UISection, U as UIContext, y as Color } from './types-81f7991c.js';
2
+ import { S as Size, R as Rect, h as UISection, U as UIContext, G as Color } from './types-3eb61b5c.js';
3
3
  import 'openai/resources';
4
4
 
5
5
  /**
@@ -1,4 +1,4 @@
1
- import { B as BaseElement, U as UIContext, q as PlanningAction } from './types-81f7991c.js';
1
+ import { B as BaseElement, U as UIContext, r as PlanningAction } from './types-3eb61b5c.js';
2
2
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
3
3
 
4
4
  type AIArgs = [
@@ -1,7 +1,7 @@
1
- import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightTaskInfo, c as InsightOptions, d as InsightAssertionResponse } from './types-81f7991c.js';
2
- export { g as AIAssertionResponse, e as AIElementParseResponse, A as AIResponseFormat, f as AISectionParseResponse, z as BaseAgentParserOpt, j as BasicSectionQuery, C as CallAIFn, y as Color, l as DumpMeta, p as ElementById, i as EnsureObject, H as ExecutionRecorderItem, a3 as ExecutionTaskAction, a2 as ExecutionTaskActionApply, a1 as ExecutionTaskInsightAssertion, a0 as ExecutionTaskInsightAssertionApply, $ as ExecutionTaskInsightAssertionParam, Q as ExecutionTaskInsightDumpLog, W as ExecutionTaskInsightLocate, V as ExecutionTaskInsightLocateApply, O as ExecutionTaskInsightLocateOutput, N as ExecutionTaskInsightLocateParam, _ as ExecutionTaskInsightQuery, Z as ExecutionTaskInsightQueryApply, Y as ExecutionTaskInsightQueryOutput, X as ExecutionTaskInsightQueryParam, a5 as ExecutionTaskPlanning, a4 as ExecutionTaskPlanningApply, M as ExecutionTaskReturn, J as ExecutionTaskType, K as ExecutorContext, a6 as GroupedActionDump, n as InsightDump, k as InsightExtractParam, L as LiteUISection, o as PartialInsightDumpFromSDK, r as PlanningAIResponse, q as PlanningAction, w as PlanningActionParamAssert, t as PlanningActionParamHover, u as PlanningActionParamInputOrKeyPress, v as PlanningActionParamScroll, x as PlanningActionParamSleep, s as PlanningActionParamTap, G as PlaywrightParserOpt, P as Point, F as PuppeteerParserOpt, R as Rect, m as ReportDumpWithAttributes, S as Size, T as TaskCacheInfo, h as UISection } from './types-81f7991c.js';
3
- import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-f43935c0.js';
4
- export { p as plan } from './index-f43935c0.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskApply, b as ExecutionDump, B as BaseElement, U as UIContext, D as DumpSubscriber, I as InsightTaskInfo, c as InsightOptions, d as InsightAssertionResponse } from './types-3eb61b5c.js';
2
+ export { g as AIAssertionResponse, e as AIElementParseResponse, A as AIResponseFormat, f as AISectionParseResponse, q as AgentWaitForOpt, H as BaseAgentParserOpt, j as BasicSectionQuery, C as CallAIFn, G as Color, l as DumpMeta, p as ElementById, i as EnsureObject, M as ExecutionRecorderItem, a6 as ExecutionTaskAction, a5 as ExecutionTaskActionApply, a4 as ExecutionTaskInsightAssertion, a3 as ExecutionTaskInsightAssertionApply, a2 as ExecutionTaskInsightAssertionParam, X as ExecutionTaskInsightDumpLog, Z as ExecutionTaskInsightLocate, Y as ExecutionTaskInsightLocateApply, W as ExecutionTaskInsightLocateOutput, V as ExecutionTaskInsightLocateParam, a1 as ExecutionTaskInsightQuery, a0 as ExecutionTaskInsightQueryApply, $ as ExecutionTaskInsightQueryOutput, _ as ExecutionTaskInsightQueryParam, a8 as ExecutionTaskPlanning, a7 as ExecutionTaskPlanningApply, Q as ExecutionTaskReturn, N as ExecutionTaskType, O as ExecutorContext, a9 as GroupedActionDump, n as InsightDump, k as InsightExtractParam, L as LiteUISection, o as PartialInsightDumpFromSDK, s as PlanningAIResponse, r as PlanningAction, x as PlanningActionParamAssert, z as PlanningActionParamError, u as PlanningActionParamHover, v as PlanningActionParamInputOrKeyPress, w as PlanningActionParamScroll, y as PlanningActionParamSleep, t as PlanningActionParamTap, F as PlanningActionParamWaitFor, K as PlaywrightParserOpt, P as Point, J as PuppeteerParserOpt, R as Rect, m as ReportDumpWithAttributes, S as Size, T as TaskCacheInfo, h as UISection } from './types-3eb61b5c.js';
3
+ import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-0479d487.js';
4
+ export { p as plan } from './index-0479d487.js';
5
5
  export { setLogDir } from './utils.js';
6
6
  import 'openai/resources';
7
7
 
@@ -10,7 +10,7 @@ interface Size {
10
10
  }
11
11
  type Rect = Point & Size;
12
12
  declare enum NodeType {
13
- INPUT = "INPUT Node",
13
+ FORM_ITEM = "FORM_ITEM Node",
14
14
  BUTTON = "BUTTON Node",
15
15
  IMG = "IMG Node",
16
16
  TEXT = "TEXT Node"
@@ -122,13 +122,20 @@ interface LiteUISection {
122
122
  }
123
123
  type ElementById = (id: string) => BaseElement | null;
124
124
  type InsightAssertionResponse = AIAssertionResponse;
125
+ /**
126
+ * agent
127
+ */
128
+ interface AgentWaitForOpt {
129
+ checkIntervalMs?: number;
130
+ timeoutMs?: number;
131
+ }
125
132
  /**
126
133
  * planning
127
134
  *
128
135
  */
129
136
  interface PlanningAction<ParamType = any> {
130
137
  thought?: string;
131
- type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'Sleep';
138
+ type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
132
139
  param: ParamType;
133
140
  }
134
141
  interface PlanningAIResponse {
@@ -150,6 +157,12 @@ interface PlanningActionParamAssert {
150
157
  interface PlanningActionParamSleep {
151
158
  timeMs: number;
152
159
  }
160
+ interface PlanningActionParamError {
161
+ thought: string;
162
+ }
163
+ type PlanningActionParamWaitFor = AgentWaitForOpt & {
164
+ assertion: string;
165
+ };
153
166
  /**
154
167
  * misc
155
168
  */
@@ -191,7 +204,7 @@ interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {
191
204
  cache?: TaskCacheInfo;
192
205
  }
193
206
  type ExecutionTask<E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<any, any, any>> = E & ExecutionTaskReturn<E extends ExecutionTaskApply<any, any, infer TaskOutput, any> ? TaskOutput : unknown, E extends ExecutionTaskApply<any, any, any, infer TaskLog> ? TaskLog : unknown> & {
194
- status: 'pending' | 'running' | 'success' | 'failed' | 'cancelled';
207
+ status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';
195
208
  error?: string;
196
209
  errorStack?: string;
197
210
  timing?: {
@@ -243,4 +256,4 @@ interface GroupedActionDump {
243
256
  executions: ExecutionDump[];
244
257
  }
245
258
 
246
- export { type ExecutionTaskInsightAssertionParam as $, AIResponseFormat as A, BaseElement as B, type CallAIFn as C, type DumpSubscriber as D, type ExecutionTask as E, type PuppeteerParserOpt as F, type PlaywrightParserOpt as G, type ExecutionRecorderItem as H, type InsightTaskInfo as I, type ExecutionTaskType as J, type ExecutorContext as K, type LiteUISection as L, type ExecutionTaskReturn as M, type ExecutionTaskInsightLocateParam as N, type ExecutionTaskInsightLocateOutput as O, type Point as P, type ExecutionTaskInsightDumpLog as Q, type Rect as R, type Size as S, type TaskCacheInfo as T, UIContext as U, type ExecutionTaskInsightLocateApply as V, type ExecutionTaskInsightLocate as W, type ExecutionTaskInsightQueryParam as X, type ExecutionTaskInsightQueryOutput as Y, type ExecutionTaskInsightQueryApply as Z, type ExecutionTaskInsightQuery as _, type ExecutionTaskApply as a, type ExecutionTaskInsightAssertionApply as a0, type ExecutionTaskInsightAssertion as a1, type ExecutionTaskActionApply as a2, type ExecutionTaskAction as a3, type ExecutionTaskPlanningApply as a4, type ExecutionTaskPlanning as a5, type GroupedActionDump as a6, type ExecutionDump as b, type InsightOptions as c, type InsightAssertionResponse as d, type AIElementParseResponse as e, type AISectionParseResponse as f, type AIAssertionResponse as g, type UISection as h, type EnsureObject as i, type BasicSectionQuery as j, type InsightExtractParam as k, type DumpMeta as l, type ReportDumpWithAttributes as m, type InsightDump as n, type PartialInsightDumpFromSDK as o, type ElementById as p, type PlanningAction as q, type PlanningAIResponse as r, type PlanningActionParamTap as s, type PlanningActionParamHover as t, type PlanningActionParamInputOrKeyPress as u, type PlanningActionParamScroll as v, type PlanningActionParamAssert as w, type PlanningActionParamSleep as x, type Color as y, type BaseAgentParserOpt as z };
259
+ export { type ExecutionTaskInsightQueryOutput as $, AIResponseFormat as A, BaseElement as B, type CallAIFn as C, type DumpSubscriber as D, type ExecutionTask as E, type PlanningActionParamWaitFor as F, type Color as G, type BaseAgentParserOpt as H, type InsightTaskInfo as I, type PuppeteerParserOpt as J, type PlaywrightParserOpt as K, type LiteUISection as L, type ExecutionRecorderItem as M, type ExecutionTaskType as N, type ExecutorContext as O, type Point as P, type ExecutionTaskReturn as Q, type Rect as R, type Size as S, type TaskCacheInfo as T, UIContext as U, type ExecutionTaskInsightLocateParam as V, type ExecutionTaskInsightLocateOutput as W, type ExecutionTaskInsightDumpLog as X, type ExecutionTaskInsightLocateApply as Y, type ExecutionTaskInsightLocate as Z, type ExecutionTaskInsightQueryParam as _, type ExecutionTaskApply as a, type ExecutionTaskInsightQueryApply as a0, type ExecutionTaskInsightQuery as a1, type ExecutionTaskInsightAssertionParam as a2, type ExecutionTaskInsightAssertionApply as a3, type ExecutionTaskInsightAssertion as a4, type ExecutionTaskActionApply as a5, type ExecutionTaskAction as a6, type ExecutionTaskPlanningApply as a7, type ExecutionTaskPlanning as a8, type GroupedActionDump as a9, type ExecutionDump as b, type InsightOptions as c, type InsightAssertionResponse as d, type AIElementParseResponse as e, type AISectionParseResponse as f, type AIAssertionResponse as g, type UISection as h, type EnsureObject as i, type BasicSectionQuery as j, type InsightExtractParam as k, type DumpMeta as l, type ReportDumpWithAttributes as m, type InsightDump as n, type PartialInsightDumpFromSDK as o, type ElementById as p, type AgentWaitForOpt as q, type PlanningAction as r, type PlanningAIResponse as s, type PlanningActionParamTap as t, type PlanningActionParamHover as u, type PlanningActionParamInputOrKeyPress as v, type PlanningActionParamScroll as w, type PlanningActionParamAssert as x, type PlanningActionParamSleep as y, type PlanningActionParamError as z };
@@ -1,4 +1,4 @@
1
- import { m as ReportDumpWithAttributes, R as Rect } from './types-81f7991c.js';
1
+ import { m as ReportDumpWithAttributes, R as Rect } from './types-3eb61b5c.js';
2
2
  import 'openai/resources';
3
3
 
4
4
  interface PkgInfo {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Hello, It's Midscene",
4
- "version": "0.3.0",
4
+ "version": "0.3.1-beta-20240821105917.0",
5
5
  "jsnext:source": "./src/index.ts",
6
6
  "main": "./dist/lib/index.js",
7
7
  "module": "./dist/es/index.js",
@@ -60,18 +60,19 @@
60
60
  }
61
61
  },
62
62
  "dependencies": {
63
+ "node-fetch": "2.6.7",
63
64
  "openai": "4.47.1",
64
- "sharp": "0.33.3",
65
- "node-fetch": "2.6.7"
65
+ "optional": "0.1.4",
66
+ "sharp": "0.33.3"
66
67
  },
67
68
  "devDependencies": {
68
- "@types/node-fetch": "2.6.11",
69
69
  "@modern-js/module-tools": "^2.56.1",
70
70
  "@types/node": "^18.0.0",
71
+ "@types/node-fetch": "2.6.11",
72
+ "dotenv": "16.4.5",
71
73
  "langsmith": "0.1.36",
72
74
  "typescript": "~5.0.4",
73
- "vitest": "^1.6.0",
74
- "dotenv": "16.4.5"
75
+ "vitest": "^1.6.0"
75
76
  },
76
77
  "engines": {
77
78
  "node": ">=16.0.0"
@@ -88,6 +89,6 @@
88
89
  "new": "modern new",
89
90
  "upgrade": "modern upgrade",
90
91
  "test": "vitest --run",
91
- "test:all": "AITEST=true vitest --run"
92
+ "test:ai": "AITEST=true npm run test"
92
93
  }
93
94
  }