@midscene/core 0.8.6 → 0.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +71 -33
- package/dist/lib/index.js +72 -34
- package/dist/lib/types/ai-model.d.ts +3 -3
- package/dist/lib/types/{index-2b4593d9.d.ts → index-43fd19f4.d.ts} +2 -3
- package/dist/lib/types/index.d.ts +4 -4
- package/dist/lib/types/{types-7bcbf7fe.d.ts → types-55182ae1.d.ts} +4 -2
- package/dist/lib/types/utils.d.ts +1 -1
- package/dist/lib/utils.js +1 -1
- package/package.json +2 -2
- package/report/index.html +2 -2
package/dist/lib/ai-model.js
CHANGED
|
@@ -4355,8 +4355,8 @@ var allAIConfig = () => {
|
|
|
4355
4355
|
|
|
4356
4356
|
// src/ai-model/common.ts
|
|
4357
4357
|
async function callAiFn(options) {
|
|
4358
|
-
const {
|
|
4359
|
-
if (preferOpenAIModel(
|
|
4358
|
+
const { msgs, AIActionType: AIActionTypeValue } = options;
|
|
4359
|
+
if (preferOpenAIModel("openAI")) {
|
|
4360
4360
|
const { content, usage } = await callToGetJSONObject(
|
|
4361
4361
|
msgs,
|
|
4362
4362
|
AIActionTypeValue
|
|
@@ -4823,14 +4823,14 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
4823
4823
|
|
|
4824
4824
|
## Objective
|
|
4825
4825
|
|
|
4826
|
-
- Decompose the
|
|
4826
|
+
- Decompose the instruction user asked into a series of actions
|
|
4827
4827
|
- Locate the target element if possible
|
|
4828
|
-
- If the
|
|
4828
|
+
- If the instruction cannot be accomplished, give a further plan.
|
|
4829
4829
|
|
|
4830
4830
|
## Workflow
|
|
4831
4831
|
|
|
4832
4832
|
1. Receive the user's element description, screenshot, and instruction.
|
|
4833
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll /
|
|
4833
|
+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
|
|
4834
4834
|
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
|
|
4835
4835
|
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
|
|
4836
4836
|
5. Consider whether the user's instruction will be accomplished after all the actions
|
|
@@ -4841,7 +4841,8 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
4841
4841
|
|
|
4842
4842
|
- All the actions you composed MUST be based on the page context information you get.
|
|
4843
4843
|
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
|
|
4844
|
-
-
|
|
4844
|
+
- Respond only with valid JSON. Do not write an introduction or summary.
|
|
4845
|
+
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
|
|
4845
4846
|
|
|
4846
4847
|
## About the \`actions\` field
|
|
4847
4848
|
|
|
@@ -4866,10 +4867,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
|
4866
4867
|
* \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
|
|
4867
4868
|
- type: 'KeyboardPress', press a key
|
|
4868
4869
|
* { param: { value: string } }
|
|
4869
|
-
- type: 'Scroll'
|
|
4870
|
-
* {
|
|
4871
|
-
|
|
4872
|
-
|
|
4870
|
+
- type: 'Scroll', scroll up or down.
|
|
4871
|
+
* {
|
|
4872
|
+
locate: LocateParam | null,
|
|
4873
|
+
param: {
|
|
4874
|
+
direction: 'down'(default) | 'up' | 'right' | 'left',
|
|
4875
|
+
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
|
|
4876
|
+
distance: null | number
|
|
4877
|
+
}
|
|
4878
|
+
}
|
|
4879
|
+
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
4880
|
+
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
|
|
4881
|
+
- type: 'FalsyConditionStatement'
|
|
4882
|
+
* { param: null }
|
|
4883
|
+
* use this action when the instruction is an "if" statement and the condition is falsy.
|
|
4873
4884
|
- type: 'Sleep'
|
|
4874
4885
|
* { param: { timeMs: number } }
|
|
4875
4886
|
|
|
@@ -4883,7 +4894,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
|
4883
4894
|
|
|
4884
4895
|
## Output JSON Format:
|
|
4885
4896
|
|
|
4886
|
-
|
|
4897
|
+
The JSON format is as follows:
|
|
4898
|
+
|
|
4887
4899
|
{
|
|
4888
4900
|
"actions": [
|
|
4889
4901
|
{
|
|
@@ -4941,6 +4953,7 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
4941
4953
|
"locate": null
|
|
4942
4954
|
},
|
|
4943
4955
|
],
|
|
4956
|
+
"error": null,
|
|
4944
4957
|
"taskWillBeAccomplished": false,
|
|
4945
4958
|
"furtherPlan": {
|
|
4946
4959
|
"whatToDoNext": "find the 'English' option and click on it",
|
|
@@ -4949,7 +4962,39 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
4949
4962
|
}
|
|
4950
4963
|
\`\`\`
|
|
4951
4964
|
|
|
4952
|
-
|
|
4965
|
+
|
|
4966
|
+
## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
|
|
4967
|
+
|
|
4968
|
+
If the user says "If there is a popup, close it", you should consider this and output the JSON:
|
|
4969
|
+
|
|
4970
|
+
* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
|
|
4971
|
+
* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
|
|
4972
|
+
|
|
4973
|
+
\`\`\`json
|
|
4974
|
+
{
|
|
4975
|
+
"actions": [{
|
|
4976
|
+
"thought": "There is no popup on the page",
|
|
4977
|
+
"type": "FalsyConditionStatement",
|
|
4978
|
+
"param": null
|
|
4979
|
+
}
|
|
4980
|
+
],
|
|
4981
|
+
"taskWillBeAccomplished": true,
|
|
4982
|
+
"furtherPlan": null
|
|
4983
|
+
}
|
|
4984
|
+
\`\`\`
|
|
4985
|
+
|
|
4986
|
+
For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
|
|
4987
|
+
|
|
4988
|
+
\`\`\`json
|
|
4989
|
+
{
|
|
4990
|
+
"actions": [],
|
|
4991
|
+
"error": "The instruction and page context are irrelevant, there is no popup on the page",
|
|
4992
|
+
"taskWillBeAccomplished": true,
|
|
4993
|
+
"furtherPlan": null
|
|
4994
|
+
}
|
|
4995
|
+
\`\`\`
|
|
4996
|
+
|
|
4997
|
+
## Example #3 : When task is accomplished, don't plan more actions
|
|
4953
4998
|
|
|
4954
4999
|
When the user ask to "Wait 4s", you should consider this:
|
|
4955
5000
|
|
|
@@ -5020,7 +5065,7 @@ var planSchema = {
|
|
|
5020
5065
|
},
|
|
5021
5066
|
param: {
|
|
5022
5067
|
type: ["object", "null"],
|
|
5023
|
-
description: "Parameter
|
|
5068
|
+
description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
|
|
5024
5069
|
},
|
|
5025
5070
|
locate: {
|
|
5026
5071
|
type: ["object", "null"],
|
|
@@ -5154,11 +5199,9 @@ async function call(messages, responseFormat) {
|
|
|
5154
5199
|
return { content, usage: completion.usage };
|
|
5155
5200
|
}
|
|
5156
5201
|
async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
5157
|
-
let responseFormat = {
|
|
5158
|
-
type: "json_object" /* JSON */
|
|
5159
|
-
};
|
|
5202
|
+
let responseFormat;
|
|
5160
5203
|
const model = getModelName();
|
|
5161
|
-
if (model
|
|
5204
|
+
if (model.includes("gpt-4o")) {
|
|
5162
5205
|
switch (AIActionTypeValue) {
|
|
5163
5206
|
case 0 /* ASSERT */:
|
|
5164
5207
|
responseFormat = assertSchema;
|
|
@@ -5172,9 +5215,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
|
5172
5215
|
responseFormat = planSchema;
|
|
5173
5216
|
break;
|
|
5174
5217
|
}
|
|
5175
|
-
|
|
5176
|
-
|
|
5177
|
-
|
|
5218
|
+
if (model === "gpt-4o-2024-05-13") {
|
|
5219
|
+
responseFormat = { type: "json_object" /* JSON */ };
|
|
5220
|
+
}
|
|
5178
5221
|
}
|
|
5179
5222
|
const safeJsonParse = (input) => {
|
|
5180
5223
|
try {
|
|
@@ -5192,7 +5235,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
|
5192
5235
|
try {
|
|
5193
5236
|
return { content: JSON.parse(jsonContent), usage: response.usage };
|
|
5194
5237
|
} catch (e) {
|
|
5195
|
-
throw Error(`parse json
|
|
5238
|
+
throw Error(`failed to parse json response: ${response.content}`);
|
|
5196
5239
|
}
|
|
5197
5240
|
}
|
|
5198
5241
|
function extractJSONFromCodeBlock(response) {
|
|
@@ -5239,7 +5282,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
|
|
|
5239
5282
|
};
|
|
5240
5283
|
}
|
|
5241
5284
|
async function AiInspectElement(options) {
|
|
5242
|
-
const { context, multi, targetElementDescription, callAI
|
|
5285
|
+
const { context, multi, targetElementDescription, callAI } = options;
|
|
5243
5286
|
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
|
|
5244
5287
|
const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
|
|
5245
5288
|
if (options.quickAnswer) {
|
|
@@ -5314,8 +5357,7 @@ ${JSON.stringify({
|
|
|
5314
5357
|
if (callAI) {
|
|
5315
5358
|
const res = await callAI({
|
|
5316
5359
|
msgs,
|
|
5317
|
-
AIActionType: 1 /* INSPECT_ELEMENT
|
|
5318
|
-
useModel
|
|
5360
|
+
AIActionType: 1 /* INSPECT_ELEMENT */
|
|
5319
5361
|
});
|
|
5320
5362
|
return {
|
|
5321
5363
|
parseResult: transformElementPositionToId(res.content, context.content),
|
|
@@ -5326,8 +5368,7 @@ ${JSON.stringify({
|
|
|
5326
5368
|
}
|
|
5327
5369
|
const inspectElement = await callAiFn({
|
|
5328
5370
|
msgs,
|
|
5329
|
-
AIActionType: 1 /* INSPECT_ELEMENT
|
|
5330
|
-
useModel
|
|
5371
|
+
AIActionType: 1 /* INSPECT_ELEMENT */
|
|
5331
5372
|
});
|
|
5332
5373
|
return {
|
|
5333
5374
|
parseResult: transformElementPositionToId(
|
|
@@ -5377,7 +5418,6 @@ DATA_DEMAND ends.
|
|
|
5377
5418
|
];
|
|
5378
5419
|
const result = await callAiFn({
|
|
5379
5420
|
msgs,
|
|
5380
|
-
useModel,
|
|
5381
5421
|
AIActionType: 2 /* EXTRACT_DATA */
|
|
5382
5422
|
});
|
|
5383
5423
|
return {
|
|
@@ -5420,8 +5460,7 @@ async function AiAssert(options) {
|
|
|
5420
5460
|
];
|
|
5421
5461
|
const { content: assertResult, usage } = await callAiFn({
|
|
5422
5462
|
msgs,
|
|
5423
|
-
AIActionType: 0 /* ASSERT
|
|
5424
|
-
useModel
|
|
5463
|
+
AIActionType: 0 /* ASSERT */
|
|
5425
5464
|
});
|
|
5426
5465
|
return {
|
|
5427
5466
|
content: assertResult,
|
|
@@ -5431,7 +5470,7 @@ async function AiAssert(options) {
|
|
|
5431
5470
|
|
|
5432
5471
|
// src/ai-model/automation/index.ts
|
|
5433
5472
|
var import_node_assert4 = __toESM(require("assert"));
|
|
5434
|
-
async function plan(userPrompt, opts
|
|
5473
|
+
async function plan(userPrompt, opts) {
|
|
5435
5474
|
const { callAI, context } = opts || {};
|
|
5436
5475
|
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
|
|
5437
5476
|
const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
|
|
@@ -5468,7 +5507,7 @@ pageDescription:
|
|
|
5468
5507
|
${pageDescription}
|
|
5469
5508
|
|
|
5470
5509
|
|
|
5471
|
-
Here is
|
|
5510
|
+
Here is the instruction:
|
|
5472
5511
|
=====================================
|
|
5473
5512
|
${userPrompt}
|
|
5474
5513
|
=====================================
|
|
@@ -5482,8 +5521,7 @@ ${taskBackgroundContext}
|
|
|
5482
5521
|
const call2 = callAI || callAiFn;
|
|
5483
5522
|
const { content, usage } = await call2({
|
|
5484
5523
|
msgs,
|
|
5485
|
-
AIActionType: 3 /* PLAN
|
|
5486
|
-
useModel
|
|
5524
|
+
AIActionType: 3 /* PLAN */
|
|
5487
5525
|
});
|
|
5488
5526
|
const planFromAI = content;
|
|
5489
5527
|
const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];
|
package/dist/lib/index.js
CHANGED
|
@@ -4506,7 +4506,7 @@ function stringifyDumpData(data, indents) {
|
|
|
4506
4506
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
4507
4507
|
}
|
|
4508
4508
|
function getVersion() {
|
|
4509
|
-
return "0.8.6";
|
|
4509
|
+
return "0.8.7";
|
|
4510
4510
|
}
|
|
4511
4511
|
|
|
4512
4512
|
// src/action/executor.ts
|
|
@@ -5170,14 +5170,14 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
5170
5170
|
|
|
5171
5171
|
## Objective
|
|
5172
5172
|
|
|
5173
|
-
- Decompose the
|
|
5173
|
+
- Decompose the instruction user asked into a series of actions
|
|
5174
5174
|
- Locate the target element if possible
|
|
5175
|
-
- If the
|
|
5175
|
+
- If the instruction cannot be accomplished, give a further plan.
|
|
5176
5176
|
|
|
5177
5177
|
## Workflow
|
|
5178
5178
|
|
|
5179
5179
|
1. Receive the user's element description, screenshot, and instruction.
|
|
5180
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll /
|
|
5180
|
+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
|
|
5181
5181
|
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
|
|
5182
5182
|
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
|
|
5183
5183
|
5. Consider whether the user's instruction will be accomplished after all the actions
|
|
@@ -5188,7 +5188,8 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
5188
5188
|
|
|
5189
5189
|
- All the actions you composed MUST be based on the page context information you get.
|
|
5190
5190
|
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
|
|
5191
|
-
-
|
|
5191
|
+
- Respond only with valid JSON. Do not write an introduction or summary.
|
|
5192
|
+
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
|
|
5192
5193
|
|
|
5193
5194
|
## About the \`actions\` field
|
|
5194
5195
|
|
|
@@ -5213,10 +5214,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
|
5213
5214
|
* \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
|
|
5214
5215
|
- type: 'KeyboardPress', press a key
|
|
5215
5216
|
* { param: { value: string } }
|
|
5216
|
-
- type: 'Scroll'
|
|
5217
|
-
* {
|
|
5218
|
-
|
|
5219
|
-
|
|
5217
|
+
- type: 'Scroll', scroll up or down.
|
|
5218
|
+
* {
|
|
5219
|
+
locate: LocateParam | null,
|
|
5220
|
+
param: {
|
|
5221
|
+
direction: 'down'(default) | 'up' | 'right' | 'left',
|
|
5222
|
+
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
|
|
5223
|
+
distance: null | number
|
|
5224
|
+
}
|
|
5225
|
+
}
|
|
5226
|
+
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
5227
|
+
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
|
|
5228
|
+
- type: 'FalsyConditionStatement'
|
|
5229
|
+
* { param: null }
|
|
5230
|
+
* use this action when the instruction is an "if" statement and the condition is falsy.
|
|
5220
5231
|
- type: 'Sleep'
|
|
5221
5232
|
* { param: { timeMs: number } }
|
|
5222
5233
|
|
|
@@ -5230,7 +5241,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
|
5230
5241
|
|
|
5231
5242
|
## Output JSON Format:
|
|
5232
5243
|
|
|
5233
|
-
|
|
5244
|
+
The JSON format is as follows:
|
|
5245
|
+
|
|
5234
5246
|
{
|
|
5235
5247
|
"actions": [
|
|
5236
5248
|
{
|
|
@@ -5288,6 +5300,7 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
5288
5300
|
"locate": null
|
|
5289
5301
|
},
|
|
5290
5302
|
],
|
|
5303
|
+
"error": null,
|
|
5291
5304
|
"taskWillBeAccomplished": false,
|
|
5292
5305
|
"furtherPlan": {
|
|
5293
5306
|
"whatToDoNext": "find the 'English' option and click on it",
|
|
@@ -5296,7 +5309,39 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
5296
5309
|
}
|
|
5297
5310
|
\`\`\`
|
|
5298
5311
|
|
|
5299
|
-
|
|
5312
|
+
|
|
5313
|
+
## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
|
|
5314
|
+
|
|
5315
|
+
If the user says "If there is a popup, close it", you should consider this and output the JSON:
|
|
5316
|
+
|
|
5317
|
+
* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
|
|
5318
|
+
* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
|
|
5319
|
+
|
|
5320
|
+
\`\`\`json
|
|
5321
|
+
{
|
|
5322
|
+
"actions": [{
|
|
5323
|
+
"thought": "There is no popup on the page",
|
|
5324
|
+
"type": "FalsyConditionStatement",
|
|
5325
|
+
"param": null
|
|
5326
|
+
}
|
|
5327
|
+
],
|
|
5328
|
+
"taskWillBeAccomplished": true,
|
|
5329
|
+
"furtherPlan": null
|
|
5330
|
+
}
|
|
5331
|
+
\`\`\`
|
|
5332
|
+
|
|
5333
|
+
For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
|
|
5334
|
+
|
|
5335
|
+
\`\`\`json
|
|
5336
|
+
{
|
|
5337
|
+
"actions": [],
|
|
5338
|
+
"error": "The instruction and page context are irrelevant, there is no popup on the page",
|
|
5339
|
+
"taskWillBeAccomplished": true,
|
|
5340
|
+
"furtherPlan": null
|
|
5341
|
+
}
|
|
5342
|
+
\`\`\`
|
|
5343
|
+
|
|
5344
|
+
## Example #3 : When task is accomplished, don't plan more actions
|
|
5300
5345
|
|
|
5301
5346
|
When the user ask to "Wait 4s", you should consider this:
|
|
5302
5347
|
|
|
@@ -5367,7 +5412,7 @@ var planSchema = {
|
|
|
5367
5412
|
},
|
|
5368
5413
|
param: {
|
|
5369
5414
|
type: ["object", "null"],
|
|
5370
|
-
description: "Parameter
|
|
5415
|
+
description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
|
|
5371
5416
|
},
|
|
5372
5417
|
locate: {
|
|
5373
5418
|
type: ["object", "null"],
|
|
@@ -5501,11 +5546,9 @@ async function call(messages, responseFormat) {
|
|
|
5501
5546
|
return { content, usage: completion.usage };
|
|
5502
5547
|
}
|
|
5503
5548
|
async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
5504
|
-
let responseFormat = {
|
|
5505
|
-
type: "json_object" /* JSON */
|
|
5506
|
-
};
|
|
5549
|
+
let responseFormat;
|
|
5507
5550
|
const model = getModelName();
|
|
5508
|
-
if (model
|
|
5551
|
+
if (model.includes("gpt-4o")) {
|
|
5509
5552
|
switch (AIActionTypeValue) {
|
|
5510
5553
|
case 0 /* ASSERT */:
|
|
5511
5554
|
responseFormat = assertSchema;
|
|
@@ -5519,9 +5562,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
|
5519
5562
|
responseFormat = planSchema;
|
|
5520
5563
|
break;
|
|
5521
5564
|
}
|
|
5522
|
-
|
|
5523
|
-
|
|
5524
|
-
|
|
5565
|
+
if (model === "gpt-4o-2024-05-13") {
|
|
5566
|
+
responseFormat = { type: "json_object" /* JSON */ };
|
|
5567
|
+
}
|
|
5525
5568
|
}
|
|
5526
5569
|
const safeJsonParse = (input) => {
|
|
5527
5570
|
try {
|
|
@@ -5539,7 +5582,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
|
5539
5582
|
try {
|
|
5540
5583
|
return { content: JSON.parse(jsonContent), usage: response.usage };
|
|
5541
5584
|
} catch (e) {
|
|
5542
|
-
throw Error(`parse json
|
|
5585
|
+
throw Error(`failed to parse json response: ${response.content}`);
|
|
5543
5586
|
}
|
|
5544
5587
|
}
|
|
5545
5588
|
function extractJSONFromCodeBlock(response) {
|
|
@@ -5560,8 +5603,8 @@ function extractJSONFromCodeBlock(response) {
|
|
|
5560
5603
|
|
|
5561
5604
|
// src/ai-model/common.ts
|
|
5562
5605
|
async function callAiFn(options) {
|
|
5563
|
-
const {
|
|
5564
|
-
if (preferOpenAIModel(
|
|
5606
|
+
const { msgs, AIActionType: AIActionTypeValue } = options;
|
|
5607
|
+
if (preferOpenAIModel("openAI")) {
|
|
5565
5608
|
const { content, usage } = await callToGetJSONObject(
|
|
5566
5609
|
msgs,
|
|
5567
5610
|
AIActionTypeValue
|
|
@@ -5612,7 +5655,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
|
|
|
5612
5655
|
};
|
|
5613
5656
|
}
|
|
5614
5657
|
async function AiInspectElement(options) {
|
|
5615
|
-
const { context, multi, targetElementDescription, callAI
|
|
5658
|
+
const { context, multi, targetElementDescription, callAI } = options;
|
|
5616
5659
|
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
|
|
5617
5660
|
const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
|
|
5618
5661
|
if (options.quickAnswer) {
|
|
@@ -5687,8 +5730,7 @@ ${JSON.stringify({
|
|
|
5687
5730
|
if (callAI) {
|
|
5688
5731
|
const res = await callAI({
|
|
5689
5732
|
msgs,
|
|
5690
|
-
AIActionType: 1 /* INSPECT_ELEMENT
|
|
5691
|
-
useModel
|
|
5733
|
+
AIActionType: 1 /* INSPECT_ELEMENT */
|
|
5692
5734
|
});
|
|
5693
5735
|
return {
|
|
5694
5736
|
parseResult: transformElementPositionToId(res.content, context.content),
|
|
@@ -5699,8 +5741,7 @@ ${JSON.stringify({
|
|
|
5699
5741
|
}
|
|
5700
5742
|
const inspectElement = await callAiFn({
|
|
5701
5743
|
msgs,
|
|
5702
|
-
AIActionType: 1 /* INSPECT_ELEMENT
|
|
5703
|
-
useModel
|
|
5744
|
+
AIActionType: 1 /* INSPECT_ELEMENT */
|
|
5704
5745
|
});
|
|
5705
5746
|
return {
|
|
5706
5747
|
parseResult: transformElementPositionToId(
|
|
@@ -5750,7 +5791,6 @@ DATA_DEMAND ends.
|
|
|
5750
5791
|
];
|
|
5751
5792
|
const result = await callAiFn({
|
|
5752
5793
|
msgs,
|
|
5753
|
-
useModel,
|
|
5754
5794
|
AIActionType: 2 /* EXTRACT_DATA */
|
|
5755
5795
|
});
|
|
5756
5796
|
return {
|
|
@@ -5793,8 +5833,7 @@ async function AiAssert(options) {
|
|
|
5793
5833
|
];
|
|
5794
5834
|
const { content: assertResult, usage } = await callAiFn({
|
|
5795
5835
|
msgs,
|
|
5796
|
-
AIActionType: 0 /* ASSERT
|
|
5797
|
-
useModel
|
|
5836
|
+
AIActionType: 0 /* ASSERT */
|
|
5798
5837
|
});
|
|
5799
5838
|
return {
|
|
5800
5839
|
content: assertResult,
|
|
@@ -5804,7 +5843,7 @@ async function AiAssert(options) {
|
|
|
5804
5843
|
|
|
5805
5844
|
// src/ai-model/automation/index.ts
|
|
5806
5845
|
var import_node_assert6 = __toESM(require("assert"));
|
|
5807
|
-
async function plan(userPrompt, opts
|
|
5846
|
+
async function plan(userPrompt, opts) {
|
|
5808
5847
|
const { callAI, context } = opts || {};
|
|
5809
5848
|
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
|
|
5810
5849
|
const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
|
|
@@ -5841,7 +5880,7 @@ pageDescription:
|
|
|
5841
5880
|
${pageDescription}
|
|
5842
5881
|
|
|
5843
5882
|
|
|
5844
|
-
Here is
|
|
5883
|
+
Here is the instruction:
|
|
5845
5884
|
=====================================
|
|
5846
5885
|
${userPrompt}
|
|
5847
5886
|
=====================================
|
|
@@ -5855,8 +5894,7 @@ ${taskBackgroundContext}
|
|
|
5855
5894
|
const call2 = callAI || callAiFn;
|
|
5856
5895
|
const { content, usage } = await call2({
|
|
5857
5896
|
msgs,
|
|
5858
|
-
AIActionType: 3 /* PLAN
|
|
5859
|
-
useModel
|
|
5897
|
+
AIActionType: 3 /* PLAN */
|
|
5860
5898
|
});
|
|
5861
5899
|
const planFromAI = content;
|
|
5862
5900
|
const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { g as AIUsageInfo } from './types-7bcbf7fe.js';
|
|
1
|
+
import { g as AIUsageInfo } from './types-55182ae1.js';
|
|
2
2
|
import { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
3
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
4
|
-
import { A as AIActionType } from './index-2b4593d9.js';
|
|
5
|
-
export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
|
|
4
|
+
import { A as AIActionType } from './index-43fd19f4.js';
|
|
5
|
+
export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
|
|
6
6
|
|
|
7
7
|
declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
|
|
8
8
|
content: T;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-
|
|
1
|
+
import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-55182ae1.js';
|
|
2
2
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
|
@@ -14,7 +14,6 @@ declare enum AIActionType {
|
|
|
14
14
|
declare function callAiFn<T>(options: {
|
|
15
15
|
msgs: AIArgs;
|
|
16
16
|
AIActionType: AIActionType;
|
|
17
|
-
useModel?: 'openAI' | 'coze';
|
|
18
17
|
}): Promise<{
|
|
19
18
|
content: T;
|
|
20
19
|
usage?: AIUsageInfo;
|
|
@@ -116,6 +115,6 @@ declare function plan(userPrompt: string, opts: {
|
|
|
116
115
|
originalPrompt?: string;
|
|
117
116
|
context: UIContext;
|
|
118
117
|
callAI?: typeof callAiFn<PlanningAIResponse>;
|
|
119
|
-
}
|
|
118
|
+
}): Promise<PlanningAIResponse>;
|
|
120
119
|
|
|
121
120
|
export { AIActionType as A, retrieveSection as a, AiInspectElement as b, callAiFn as c, describeUserPage as d, AiExtractElementInfo as e, AiAssert as f, plan as p, retrieveElement as r, transformElementPositionToId as t };
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-
|
|
2
|
-
export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-55182ae1.js';
|
|
2
|
+
export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-55182ae1.js';
|
|
3
3
|
export { allAIConfig, getAIConfig, overrideAIConfig } from './env.js';
|
|
4
|
-
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-2b4593d9.js';
|
|
5
|
-
export { p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
|
|
4
|
+
import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-43fd19f4.js';
|
|
5
|
+
export { p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
|
|
6
6
|
export { getLogDirByType, getVersion, setLogDir } from './utils.js';
|
|
7
7
|
import 'openai/resources';
|
|
8
8
|
|
|
@@ -176,7 +176,7 @@ interface PlanningLocateParam {
|
|
|
176
176
|
}
|
|
177
177
|
interface PlanningAction<ParamType = any> {
|
|
178
178
|
thought?: string;
|
|
179
|
-
type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
|
|
179
|
+
type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
|
|
180
180
|
param: ParamType;
|
|
181
181
|
locate: PlanningLocateParam | null;
|
|
182
182
|
}
|
|
@@ -197,7 +197,9 @@ interface PlanningActionParamInputOrKeyPress {
|
|
|
197
197
|
value: string;
|
|
198
198
|
}
|
|
199
199
|
interface PlanningActionParamScroll {
|
|
200
|
-
|
|
200
|
+
direction: 'down' | 'up' | 'right' | 'left';
|
|
201
|
+
scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
|
|
202
|
+
distance: null | number;
|
|
201
203
|
}
|
|
202
204
|
interface PlanningActionParamAssert {
|
|
203
205
|
assertion: string;
|
package/dist/lib/utils.js
CHANGED
|
@@ -272,7 +272,7 @@ function stringifyDumpData(data, indents) {
|
|
|
272
272
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
273
273
|
}
|
|
274
274
|
function getVersion() {
|
|
275
|
-
return "0.8.6";
|
|
275
|
+
return "0.8.7";
|
|
276
276
|
}
|
|
277
277
|
function debugLog(...message) {
|
|
278
278
|
const debugMode = getAIConfig(MIDSCENE_DEBUG_MODE);
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@midscene/core",
|
|
3
3
|
"description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
|
|
4
|
-
"version": "0.8.
|
|
4
|
+
"version": "0.8.7",
|
|
5
5
|
"repository": "https://github.com/web-infra-dev/midscene",
|
|
6
6
|
"homepage": "https://midscenejs.com/",
|
|
7
7
|
"jsnext:source": "./src/index.ts",
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
"openai": "4.57.1",
|
|
40
40
|
"optional": "0.1.4",
|
|
41
41
|
"socks-proxy-agent": "8.0.4",
|
|
42
|
-
"@midscene/shared": "0.8.
|
|
42
|
+
"@midscene/shared": "0.8.7"
|
|
43
43
|
},
|
|
44
44
|
"devDependencies": {
|
|
45
45
|
"@modern-js/module-tools": "2.60.6",
|