@midscene/core 0.8.6 → 0.8.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -4355,8 +4355,8 @@ var allAIConfig = () => {
4355
4355
 
4356
4356
  // src/ai-model/common.ts
4357
4357
  async function callAiFn(options) {
4358
- const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
4359
- if (preferOpenAIModel(useModel)) {
4358
+ const { msgs, AIActionType: AIActionTypeValue } = options;
4359
+ if (preferOpenAIModel("openAI")) {
4360
4360
  const { content, usage } = await callToGetJSONObject(
4361
4361
  msgs,
4362
4362
  AIActionTypeValue
@@ -4823,14 +4823,14 @@ You are a versatile professional in software UI automation. Your outstanding con
4823
4823
 
4824
4824
  ## Objective
4825
4825
 
4826
- - Decompose the task user asked into a series of actions
4826
+ - Decompose the instruction user asked into a series of actions
4827
4827
  - Locate the target element if possible
4828
- - If the task cannot be accomplished, give a further plan.
4828
+ - If the instruction cannot be accomplished, give a further plan.
4829
4829
 
4830
4830
  ## Workflow
4831
4831
 
4832
4832
  1. Receive the user's element description, screenshot, and instruction.
4833
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
4833
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
4834
4834
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
4835
4835
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
4836
4836
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -4841,7 +4841,8 @@ You are a versatile professional in software UI automation. Your outstanding con
4841
4841
 
4842
4842
  - All the actions you composed MUST be based on the page context information you get.
4843
4843
  - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
4844
- - If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
4844
+ - Respond only with valid JSON. Do not write an introduction or summary.
4845
+ - If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
4845
4846
 
4846
4847
  ## About the \`actions\` field
4847
4848
 
@@ -4866,10 +4867,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
4866
4867
  * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
4867
4868
  - type: 'KeyboardPress', press a key
4868
4869
  * { param: { value: string } }
4869
- - type: 'Scroll'
4870
- * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
4871
- - type: 'Error'
4872
- * { param: { message: string } }
4870
+ - type: 'Scroll', scroll up or down.
4871
+ * {
4872
+ locate: LocateParam | null,
4873
+ param: {
4874
+ direction: 'down'(default) | 'up' | 'right' | 'left',
4875
+ scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
4876
+ distance: null | number
4877
+ }
4878
+ }
4879
+ * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
4880
+ * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
4881
+ - type: 'FalsyConditionStatement'
4882
+ * { param: null }
4883
+ * use this action when the instruction is an "if" statement and the condition is falsy.
4873
4884
  - type: 'Sleep'
4874
4885
  * { param: { timeMs: number } }
4875
4886
 
@@ -4883,7 +4894,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
4883
4894
 
4884
4895
  ## Output JSON Format:
4885
4896
 
4886
- Please return the result in JSON format as follows:
4897
+ The JSON format is as follows:
4898
+
4887
4899
  {
4888
4900
  "actions": [
4889
4901
  {
@@ -4941,6 +4953,7 @@ By viewing the page screenshot and description, you should consider this and out
4941
4953
  "locate": null
4942
4954
  },
4943
4955
  ],
4956
+ "error": null,
4944
4957
  "taskWillBeAccomplished": false,
4945
4958
  "furtherPlan": {
4946
4959
  "whatToDoNext": "find the 'English' option and click on it",
@@ -4949,7 +4962,39 @@ By viewing the page screenshot and description, you should consider this and out
4949
4962
  }
4950
4963
  \`\`\`
4951
4964
 
4952
- ## Example #2 : When task is accomplished, don't plan more actions
4965
+
4966
+ ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
4967
+
4968
+ If the user says "If there is a popup, close it", you should consider this and output the JSON:
4969
+
4970
+ * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
4971
+ * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
4972
+
4973
+ \`\`\`json
4974
+ {
4975
+ "actions": [{
4976
+ "thought": "There is no popup on the page",
4977
+ "type": "FalsyConditionStatement",
4978
+ "param": null
4979
+ }
4980
+ ],
4981
+ "taskWillBeAccomplished": true,
4982
+ "furtherPlan": null
4983
+ }
4984
+ \`\`\`
4985
+
4986
+ For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
4987
+
4988
+ \`\`\`json
4989
+ {
4990
+ "actions": [],
4991
+ "error": "The instruction and page context are irrelevant, there is no popup on the page",
4992
+ "taskWillBeAccomplished": true,
4993
+ "furtherPlan": null
4994
+ }
4995
+ \`\`\`
4996
+
4997
+ ## Example #3 : When task is accomplished, don't plan more actions
4953
4998
 
4954
4999
  When the user ask to "Wait 4s", you should consider this:
4955
5000
 
@@ -5020,7 +5065,7 @@ var planSchema = {
5020
5065
  },
5021
5066
  param: {
5022
5067
  type: ["object", "null"],
5023
- description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
5068
+ description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
5024
5069
  },
5025
5070
  locate: {
5026
5071
  type: ["object", "null"],
@@ -5154,11 +5199,9 @@ async function call(messages, responseFormat) {
5154
5199
  return { content, usage: completion.usage };
5155
5200
  }
5156
5201
  async function callToGetJSONObject(messages, AIActionTypeValue) {
5157
- let responseFormat = {
5158
- type: "json_object" /* JSON */
5159
- };
5202
+ let responseFormat;
5160
5203
  const model = getModelName();
5161
- if (model === "gpt-4o-2024-08-06") {
5204
+ if (model.includes("gpt-4o")) {
5162
5205
  switch (AIActionTypeValue) {
5163
5206
  case 0 /* ASSERT */:
5164
5207
  responseFormat = assertSchema;
@@ -5172,9 +5215,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
5172
5215
  responseFormat = planSchema;
5173
5216
  break;
5174
5217
  }
5175
- }
5176
- if (model.startsWith("gemini")) {
5177
- responseFormat = { type: "text" /* TEXT */ };
5218
+ if (model === "gpt-4o-2024-05-13") {
5219
+ responseFormat = { type: "json_object" /* JSON */ };
5220
+ }
5178
5221
  }
5179
5222
  const safeJsonParse = (input) => {
5180
5223
  try {
@@ -5192,7 +5235,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
5192
5235
  try {
5193
5236
  return { content: JSON.parse(jsonContent), usage: response.usage };
5194
5237
  } catch (e) {
5195
- throw Error(`parse json error: ${response.content}`);
5238
+ throw Error(`failed to parse json response: ${response.content}`);
5196
5239
  }
5197
5240
  }
5198
5241
  function extractJSONFromCodeBlock(response) {
@@ -5239,7 +5282,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
5239
5282
  };
5240
5283
  }
5241
5284
  async function AiInspectElement(options) {
5242
- const { context, multi, targetElementDescription, callAI, useModel } = options;
5285
+ const { context, multi, targetElementDescription, callAI } = options;
5243
5286
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
5244
5287
  const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
5245
5288
  if (options.quickAnswer) {
@@ -5314,8 +5357,7 @@ ${JSON.stringify({
5314
5357
  if (callAI) {
5315
5358
  const res = await callAI({
5316
5359
  msgs,
5317
- AIActionType: 1 /* INSPECT_ELEMENT */,
5318
- useModel
5360
+ AIActionType: 1 /* INSPECT_ELEMENT */
5319
5361
  });
5320
5362
  return {
5321
5363
  parseResult: transformElementPositionToId(res.content, context.content),
@@ -5326,8 +5368,7 @@ ${JSON.stringify({
5326
5368
  }
5327
5369
  const inspectElement = await callAiFn({
5328
5370
  msgs,
5329
- AIActionType: 1 /* INSPECT_ELEMENT */,
5330
- useModel
5371
+ AIActionType: 1 /* INSPECT_ELEMENT */
5331
5372
  });
5332
5373
  return {
5333
5374
  parseResult: transformElementPositionToId(
@@ -5377,7 +5418,6 @@ DATA_DEMAND ends.
5377
5418
  ];
5378
5419
  const result = await callAiFn({
5379
5420
  msgs,
5380
- useModel,
5381
5421
  AIActionType: 2 /* EXTRACT_DATA */
5382
5422
  });
5383
5423
  return {
@@ -5420,8 +5460,7 @@ async function AiAssert(options) {
5420
5460
  ];
5421
5461
  const { content: assertResult, usage } = await callAiFn({
5422
5462
  msgs,
5423
- AIActionType: 0 /* ASSERT */,
5424
- useModel
5463
+ AIActionType: 0 /* ASSERT */
5425
5464
  });
5426
5465
  return {
5427
5466
  content: assertResult,
@@ -5431,7 +5470,7 @@ async function AiAssert(options) {
5431
5470
 
5432
5471
  // src/ai-model/automation/index.ts
5433
5472
  var import_node_assert4 = __toESM(require("assert"));
5434
- async function plan(userPrompt, opts, useModel) {
5473
+ async function plan(userPrompt, opts) {
5435
5474
  const { callAI, context } = opts || {};
5436
5475
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
5437
5476
  const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
@@ -5468,7 +5507,7 @@ pageDescription:
5468
5507
  ${pageDescription}
5469
5508
 
5470
5509
 
5471
- Here is what you need to do now:
5510
+ Here is the instruction:
5472
5511
  =====================================
5473
5512
  ${userPrompt}
5474
5513
  =====================================
@@ -5482,8 +5521,7 @@ ${taskBackgroundContext}
5482
5521
  const call2 = callAI || callAiFn;
5483
5522
  const { content, usage } = await call2({
5484
5523
  msgs,
5485
- AIActionType: 3 /* PLAN */,
5486
- useModel
5524
+ AIActionType: 3 /* PLAN */
5487
5525
  });
5488
5526
  const planFromAI = content;
5489
5527
  const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];
package/dist/lib/index.js CHANGED
@@ -4506,7 +4506,7 @@ function stringifyDumpData(data, indents) {
4506
4506
  return JSON.stringify(data, replacerForPageObject, indents);
4507
4507
  }
4508
4508
  function getVersion() {
4509
- return "0.8.6";
4509
+ return "0.8.7";
4510
4510
  }
4511
4511
 
4512
4512
  // src/action/executor.ts
@@ -5170,14 +5170,14 @@ You are a versatile professional in software UI automation. Your outstanding con
5170
5170
 
5171
5171
  ## Objective
5172
5172
 
5173
- - Decompose the task user asked into a series of actions
5173
+ - Decompose the instruction user asked into a series of actions
5174
5174
  - Locate the target element if possible
5175
- - If the task cannot be accomplished, give a further plan.
5175
+ - If the instruction cannot be accomplished, give a further plan.
5176
5176
 
5177
5177
  ## Workflow
5178
5178
 
5179
5179
  1. Receive the user's element description, screenshot, and instruction.
5180
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
5180
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
5181
5181
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
5182
5182
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
5183
5183
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -5188,7 +5188,8 @@ You are a versatile professional in software UI automation. Your outstanding con
5188
5188
 
5189
5189
  - All the actions you composed MUST be based on the page context information you get.
5190
5190
  - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
5191
- - If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
5191
+ - Respond only with valid JSON. Do not write an introduction or summary.
5192
+ - If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
5192
5193
 
5193
5194
  ## About the \`actions\` field
5194
5195
 
@@ -5213,10 +5214,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
5213
5214
  * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
5214
5215
  - type: 'KeyboardPress', press a key
5215
5216
  * { param: { value: string } }
5216
- - type: 'Scroll'
5217
- * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
5218
- - type: 'Error'
5219
- * { param: { message: string } }
5217
+ - type: 'Scroll', scroll up or down.
5218
+ * {
5219
+ locate: LocateParam | null,
5220
+ param: {
5221
+ direction: 'down'(default) | 'up' | 'right' | 'left',
5222
+ scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
5223
+ distance: null | number
5224
+ }
5225
+ }
5226
+ * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
5227
+ * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
5228
+ - type: 'FalsyConditionStatement'
5229
+ * { param: null }
5230
+ * use this action when the instruction is an "if" statement and the condition is falsy.
5220
5231
  - type: 'Sleep'
5221
5232
  * { param: { timeMs: number } }
5222
5233
 
@@ -5230,7 +5241,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
5230
5241
 
5231
5242
  ## Output JSON Format:
5232
5243
 
5233
- Please return the result in JSON format as follows:
5244
+ The JSON format is as follows:
5245
+
5234
5246
  {
5235
5247
  "actions": [
5236
5248
  {
@@ -5288,6 +5300,7 @@ By viewing the page screenshot and description, you should consider this and out
5288
5300
  "locate": null
5289
5301
  },
5290
5302
  ],
5303
+ "error": null,
5291
5304
  "taskWillBeAccomplished": false,
5292
5305
  "furtherPlan": {
5293
5306
  "whatToDoNext": "find the 'English' option and click on it",
@@ -5296,7 +5309,39 @@ By viewing the page screenshot and description, you should consider this and out
5296
5309
  }
5297
5310
  \`\`\`
5298
5311
 
5299
- ## Example #2 : When task is accomplished, don't plan more actions
5312
+
5313
+ ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
5314
+
5315
+ If the user says "If there is a popup, close it", you should consider this and output the JSON:
5316
+
5317
+ * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
5318
+ * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
5319
+
5320
+ \`\`\`json
5321
+ {
5322
+ "actions": [{
5323
+ "thought": "There is no popup on the page",
5324
+ "type": "FalsyConditionStatement",
5325
+ "param": null
5326
+ }
5327
+ ],
5328
+ "taskWillBeAccomplished": true,
5329
+ "furtherPlan": null
5330
+ }
5331
+ \`\`\`
5332
+
5333
+ For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
5334
+
5335
+ \`\`\`json
5336
+ {
5337
+ "actions": [],
5338
+ "error": "The instruction and page context are irrelevant, there is no popup on the page",
5339
+ "taskWillBeAccomplished": true,
5340
+ "furtherPlan": null
5341
+ }
5342
+ \`\`\`
5343
+
5344
+ ## Example #3 : When task is accomplished, don't plan more actions
5300
5345
 
5301
5346
  When the user ask to "Wait 4s", you should consider this:
5302
5347
 
@@ -5367,7 +5412,7 @@ var planSchema = {
5367
5412
  },
5368
5413
  param: {
5369
5414
  type: ["object", "null"],
5370
- description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
5415
+ description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
5371
5416
  },
5372
5417
  locate: {
5373
5418
  type: ["object", "null"],
@@ -5501,11 +5546,9 @@ async function call(messages, responseFormat) {
5501
5546
  return { content, usage: completion.usage };
5502
5547
  }
5503
5548
  async function callToGetJSONObject(messages, AIActionTypeValue) {
5504
- let responseFormat = {
5505
- type: "json_object" /* JSON */
5506
- };
5549
+ let responseFormat;
5507
5550
  const model = getModelName();
5508
- if (model === "gpt-4o-2024-08-06") {
5551
+ if (model.includes("gpt-4o")) {
5509
5552
  switch (AIActionTypeValue) {
5510
5553
  case 0 /* ASSERT */:
5511
5554
  responseFormat = assertSchema;
@@ -5519,9 +5562,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
5519
5562
  responseFormat = planSchema;
5520
5563
  break;
5521
5564
  }
5522
- }
5523
- if (model.startsWith("gemini")) {
5524
- responseFormat = { type: "text" /* TEXT */ };
5565
+ if (model === "gpt-4o-2024-05-13") {
5566
+ responseFormat = { type: "json_object" /* JSON */ };
5567
+ }
5525
5568
  }
5526
5569
  const safeJsonParse = (input) => {
5527
5570
  try {
@@ -5539,7 +5582,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
5539
5582
  try {
5540
5583
  return { content: JSON.parse(jsonContent), usage: response.usage };
5541
5584
  } catch (e) {
5542
- throw Error(`parse json error: ${response.content}`);
5585
+ throw Error(`failed to parse json response: ${response.content}`);
5543
5586
  }
5544
5587
  }
5545
5588
  function extractJSONFromCodeBlock(response) {
@@ -5560,8 +5603,8 @@ function extractJSONFromCodeBlock(response) {
5560
5603
 
5561
5604
  // src/ai-model/common.ts
5562
5605
  async function callAiFn(options) {
5563
- const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
5564
- if (preferOpenAIModel(useModel)) {
5606
+ const { msgs, AIActionType: AIActionTypeValue } = options;
5607
+ if (preferOpenAIModel("openAI")) {
5565
5608
  const { content, usage } = await callToGetJSONObject(
5566
5609
  msgs,
5567
5610
  AIActionTypeValue
@@ -5612,7 +5655,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
5612
5655
  };
5613
5656
  }
5614
5657
  async function AiInspectElement(options) {
5615
- const { context, multi, targetElementDescription, callAI, useModel } = options;
5658
+ const { context, multi, targetElementDescription, callAI } = options;
5616
5659
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
5617
5660
  const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
5618
5661
  if (options.quickAnswer) {
@@ -5687,8 +5730,7 @@ ${JSON.stringify({
5687
5730
  if (callAI) {
5688
5731
  const res = await callAI({
5689
5732
  msgs,
5690
- AIActionType: 1 /* INSPECT_ELEMENT */,
5691
- useModel
5733
+ AIActionType: 1 /* INSPECT_ELEMENT */
5692
5734
  });
5693
5735
  return {
5694
5736
  parseResult: transformElementPositionToId(res.content, context.content),
@@ -5699,8 +5741,7 @@ ${JSON.stringify({
5699
5741
  }
5700
5742
  const inspectElement = await callAiFn({
5701
5743
  msgs,
5702
- AIActionType: 1 /* INSPECT_ELEMENT */,
5703
- useModel
5744
+ AIActionType: 1 /* INSPECT_ELEMENT */
5704
5745
  });
5705
5746
  return {
5706
5747
  parseResult: transformElementPositionToId(
@@ -5750,7 +5791,6 @@ DATA_DEMAND ends.
5750
5791
  ];
5751
5792
  const result = await callAiFn({
5752
5793
  msgs,
5753
- useModel,
5754
5794
  AIActionType: 2 /* EXTRACT_DATA */
5755
5795
  });
5756
5796
  return {
@@ -5793,8 +5833,7 @@ async function AiAssert(options) {
5793
5833
  ];
5794
5834
  const { content: assertResult, usage } = await callAiFn({
5795
5835
  msgs,
5796
- AIActionType: 0 /* ASSERT */,
5797
- useModel
5836
+ AIActionType: 0 /* ASSERT */
5798
5837
  });
5799
5838
  return {
5800
5839
  content: assertResult,
@@ -5804,7 +5843,7 @@ async function AiAssert(options) {
5804
5843
 
5805
5844
  // src/ai-model/automation/index.ts
5806
5845
  var import_node_assert6 = __toESM(require("assert"));
5807
- async function plan(userPrompt, opts, useModel) {
5846
+ async function plan(userPrompt, opts) {
5808
5847
  const { callAI, context } = opts || {};
5809
5848
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
5810
5849
  const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
@@ -5841,7 +5880,7 @@ pageDescription:
5841
5880
  ${pageDescription}
5842
5881
 
5843
5882
 
5844
- Here is what you need to do now:
5883
+ Here is the instruction:
5845
5884
  =====================================
5846
5885
  ${userPrompt}
5847
5886
  =====================================
@@ -5855,8 +5894,7 @@ ${taskBackgroundContext}
5855
5894
  const call2 = callAI || callAiFn;
5856
5895
  const { content, usage } = await call2({
5857
5896
  msgs,
5858
- AIActionType: 3 /* PLAN */,
5859
- useModel
5897
+ AIActionType: 3 /* PLAN */
5860
5898
  });
5861
5899
  const planFromAI = content;
5862
5900
  const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];
@@ -1,8 +1,8 @@
1
- import { g as AIUsageInfo } from './types-7bcbf7fe.js';
1
+ import { g as AIUsageInfo } from './types-55182ae1.js';
2
2
  import { ChatCompletionMessageParam } from 'openai/resources';
3
3
  export { ChatCompletionMessageParam } from 'openai/resources';
4
- import { A as AIActionType } from './index-2b4593d9.js';
5
- export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
4
+ import { A as AIActionType } from './index-43fd19f4.js';
5
+ export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
6
6
 
7
7
  declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
8
8
  content: T;
@@ -1,4 +1,4 @@
1
- import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-7bcbf7fe.js';
1
+ import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-55182ae1.js';
2
2
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
3
3
 
4
4
  type AIArgs = [
@@ -14,7 +14,6 @@ declare enum AIActionType {
14
14
  declare function callAiFn<T>(options: {
15
15
  msgs: AIArgs;
16
16
  AIActionType: AIActionType;
17
- useModel?: 'openAI' | 'coze';
18
17
  }): Promise<{
19
18
  content: T;
20
19
  usage?: AIUsageInfo;
@@ -116,6 +115,6 @@ declare function plan(userPrompt: string, opts: {
116
115
  originalPrompt?: string;
117
116
  context: UIContext;
118
117
  callAI?: typeof callAiFn<PlanningAIResponse>;
119
- }, useModel?: 'coze' | 'openAI'): Promise<PlanningAIResponse>;
118
+ }): Promise<PlanningAIResponse>;
120
119
 
121
120
  export { AIActionType as A, retrieveSection as a, AiInspectElement as b, callAiFn as c, describeUserPage as d, AiExtractElementInfo as e, AiAssert as f, plan as p, retrieveElement as r, transformElementPositionToId as t };
@@ -1,8 +1,8 @@
1
- import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-7bcbf7fe.js';
2
- export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7bcbf7fe.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-55182ae1.js';
2
+ export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-55182ae1.js';
3
3
  export { allAIConfig, getAIConfig, overrideAIConfig } from './env.js';
4
- import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-2b4593d9.js';
5
- export { p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
4
+ import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-43fd19f4.js';
5
+ export { p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
6
6
  export { getLogDirByType, getVersion, setLogDir } from './utils.js';
7
7
  import 'openai/resources';
8
8
 
@@ -176,7 +176,7 @@ interface PlanningLocateParam {
176
176
  }
177
177
  interface PlanningAction<ParamType = any> {
178
178
  thought?: string;
179
- type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
179
+ type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
180
180
  param: ParamType;
181
181
  locate: PlanningLocateParam | null;
182
182
  }
@@ -197,7 +197,9 @@ interface PlanningActionParamInputOrKeyPress {
197
197
  value: string;
198
198
  }
199
199
  interface PlanningActionParamScroll {
200
- scrollType: 'scrollUntilTop' | 'scrollUntilBottom' | 'scrollUpOneScreen' | 'scrollDownOneScreen';
200
+ direction: 'down' | 'up' | 'right' | 'left';
201
+ scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
202
+ distance: null | number;
201
203
  }
202
204
  interface PlanningActionParamAssert {
203
205
  assertion: string;
@@ -1,4 +1,4 @@
1
- import { s as ReportDumpWithAttributes, R as Rect } from './types-7bcbf7fe.js';
1
+ import { s as ReportDumpWithAttributes, R as Rect } from './types-55182ae1.js';
2
2
  import 'openai/resources';
3
3
 
4
4
  declare const insightDumpFileExt = "insight-dump.json";
package/dist/lib/utils.js CHANGED
@@ -272,7 +272,7 @@ function stringifyDumpData(data, indents) {
272
272
  return JSON.stringify(data, replacerForPageObject, indents);
273
273
  }
274
274
  function getVersion() {
275
- return "0.8.6";
275
+ return "0.8.7";
276
276
  }
277
277
  function debugLog(...message) {
278
278
  const debugMode = getAIConfig(MIDSCENE_DEBUG_MODE);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
4
- "version": "0.8.6",
4
+ "version": "0.8.7",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "jsnext:source": "./src/index.ts",
@@ -39,7 +39,7 @@
39
39
  "openai": "4.57.1",
40
40
  "optional": "0.1.4",
41
41
  "socks-proxy-agent": "8.0.4",
42
- "@midscene/shared": "0.8.6"
42
+ "@midscene/shared": "0.8.7"
43
43
  },
44
44
  "devDependencies": {
45
45
  "@modern-js/module-tools": "2.60.6",