@midscene/core 0.8.6 → 0.8.7-beta-20241218070032.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4823,14 +4823,14 @@ You are a versatile professional in software UI automation. Your outstanding con
4823
4823
 
4824
4824
  ## Objective
4825
4825
 
4826
- - Decompose the task user asked into a series of actions
4826
+ - Decompose the user's instruction into a series of actions
4827
4827
  - Locate the target element if possible
4828
- - If the task cannot be accomplished, give a further plan.
4828
+ - If the instruction cannot be accomplished, give a further plan.
4829
4829
 
4830
4830
  ## Workflow
4831
4831
 
4832
4832
  1. Receive the user's element description, screenshot, and instruction.
4833
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
4833
+ 2. Decompose the user's instruction into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyIfStatement / Sleep). The "About the \`actions\` field" section below will give you more details.
4834
4834
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
4835
4835
  4. If some target elements are not shown in the screenshot, consider the user's instruction not feasible on this page. Follow the next steps.
4836
4836
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -4841,7 +4841,7 @@ You are a versatile professional in software UI automation. Your outstanding con
4841
4841
 
4842
4842
  - All the actions you composed MUST be based on the page context information you get.
4843
4843
  - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
4844
- - If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
4844
+ - When the instruction is conditional (e.g. "If something is true, do something"), decide whether the condition is truthy and, if it is, give the corresponding actions. If the condition is falsy, the user can tolerate that because the instruction is an "if" statement; just leave a \`FalsyIfStatement\` action.
4845
4845
 
4846
4846
  ## About the \`actions\` field
4847
4847
 
@@ -4866,10 +4866,19 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
4866
4866
  * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
4867
4867
  - type: 'KeyboardPress', press a key
4868
4868
  * { param: { value: string } }
4869
- - type: 'Scroll'
4870
- * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
4871
- - type: 'Error'
4872
- * { param: { message: string } }
4869
+ - type: 'Scroll', scroll the page or a specific element in the given direction.
4870
+ * {
4871
+ locate: LocateParam | null,
4872
+ param: {
4873
+ direction: 'down'(default) | 'up' | 'right' | 'left',
4874
+ scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
4875
+ distance: null | number
4876
+ }
4877
+ }
4878
+ * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
4879
+ * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
4880
+ - type: 'FalsyIfStatement', when there is a falsy condition and the instruction is an "if" statement (means the user can tolerate this situation)
4881
+ * { param: null }
4873
4882
  - type: 'Sleep'
4874
4883
  * { param: { timeMs: number } }
4875
4884
 
@@ -4941,6 +4950,7 @@ By viewing the page screenshot and description, you should consider this and out
4941
4950
  "locate": null
4942
4951
  },
4943
4952
  ],
4953
+ "error": null,
4944
4954
  "taskWillBeAccomplished": false,
4945
4955
  "furtherPlan": {
4946
4956
  "whatToDoNext": "find the 'English' option and click on it",
@@ -4949,7 +4959,39 @@ By viewing the page screenshot and description, you should consider this and out
4949
4959
  }
4950
4960
  \`\`\`
4951
4961
 
4952
- ## Example #2 : When task is accomplished, don't plan more actions
4962
+
4963
+ ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
4964
+
4965
+ If the user says "If there is a popup, close it", you should consider this and output the JSON:
4966
+
4967
+ * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
4968
+ * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyIfStatement\` action.
4969
+
4970
+ \`\`\`json
4971
+ {
4972
+ "actions": [{
4973
+ "thought": "There is no popup on the page",
4974
+ "type": "FalsyIfStatement",
4975
+ "param": null
4976
+ }
4977
+ ],
4978
+ "taskWillBeAccomplished": true,
4979
+ "furtherPlan": null
4980
+ }
4981
+ \`\`\`
4982
+
4983
+ For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
4984
+
4985
+ \`\`\`json
4986
+ {
4987
+ "actions": [],
4988
+ "error": "The instruction and page context are irrelevant, there is no popup on the page",
4989
+ "taskWillBeAccomplished": true,
4990
+ "furtherPlan": null
4991
+ }
4992
+ \`\`\`
4993
+
4994
+ ## Example #3 : When task is accomplished, don't plan more actions
4953
4995
 
4954
4996
  When the user asks to "Wait 4s", you should consider this:
4955
4997
 
@@ -5020,7 +5062,7 @@ var planSchema = {
5020
5062
  },
5021
5063
  param: {
5022
5064
  type: ["object", "null"],
5023
- description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
5065
+ description: "Parameter of the action, can be null ONLY when the type field is Tap, Hover, or FalsyIfStatement"
5024
5066
  },
5025
5067
  locate: {
5026
5068
  type: ["object", "null"],
@@ -5468,7 +5510,7 @@ pageDescription:
5468
5510
  ${pageDescription}
5469
5511
 
5470
5512
 
5471
- Here is what you need to do now:
5513
+ Here is the instruction:
5472
5514
  =====================================
5473
5515
  ${userPrompt}
5474
5516
  =====================================
package/dist/lib/index.js CHANGED
@@ -4506,7 +4506,7 @@ function stringifyDumpData(data, indents) {
4506
4506
  return JSON.stringify(data, replacerForPageObject, indents);
4507
4507
  }
4508
4508
  function getVersion() {
4509
- return "0.8.6";
4509
+ return "0.8.7-beta-20241218070032.0";
4510
4510
  }
4511
4511
 
4512
4512
  // src/action/executor.ts
@@ -5170,14 +5170,14 @@ You are a versatile professional in software UI automation. Your outstanding con
5170
5170
 
5171
5171
  ## Objective
5172
5172
 
5173
- - Decompose the task user asked into a series of actions
5173
+ - Decompose the user's instruction into a series of actions
5174
5174
  - Locate the target element if possible
5175
- - If the task cannot be accomplished, give a further plan.
5175
+ - If the instruction cannot be accomplished, give a further plan.
5176
5176
 
5177
5177
  ## Workflow
5178
5178
 
5179
5179
  1. Receive the user's element description, screenshot, and instruction.
5180
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
5180
+ 2. Decompose the user's instruction into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyIfStatement / Sleep). The "About the \`actions\` field" section below will give you more details.
5181
5181
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
5182
5182
  4. If some target elements are not shown in the screenshot, consider the user's instruction not feasible on this page. Follow the next steps.
5183
5183
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -5188,7 +5188,7 @@ You are a versatile professional in software UI automation. Your outstanding con
5188
5188
 
5189
5189
  - All the actions you composed MUST be based on the page context information you get.
5190
5190
  - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
5191
- - If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
5191
+ - When the instruction is conditional (e.g. "If something is true, do something"), decide whether the condition is truthy and, if it is, give the corresponding actions. If the condition is falsy, the user can tolerate that because the instruction is an "if" statement; just leave a \`FalsyIfStatement\` action.
5192
5192
 
5193
5193
  ## About the \`actions\` field
5194
5194
 
@@ -5213,10 +5213,19 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
5213
5213
  * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
5214
5214
  - type: 'KeyboardPress', press a key
5215
5215
  * { param: { value: string } }
5216
- - type: 'Scroll'
5217
- * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
5218
- - type: 'Error'
5219
- * { param: { message: string } }
5216
+ - type: 'Scroll', scroll the page or a specific element in the given direction.
5217
+ * {
5218
+ locate: LocateParam | null,
5219
+ param: {
5220
+ direction: 'down'(default) | 'up' | 'right' | 'left',
5221
+ scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
5222
+ distance: null | number
5223
+ }
5224
+ }
5225
+ * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
5226
+ * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
5227
+ - type: 'FalsyIfStatement', when there is a falsy condition and the instruction is an "if" statement (means the user can tolerate this situation)
5228
+ * { param: null }
5220
5229
  - type: 'Sleep'
5221
5230
  * { param: { timeMs: number } }
5222
5231
 
@@ -5288,6 +5297,7 @@ By viewing the page screenshot and description, you should consider this and out
5288
5297
  "locate": null
5289
5298
  },
5290
5299
  ],
5300
+ "error": null,
5291
5301
  "taskWillBeAccomplished": false,
5292
5302
  "furtherPlan": {
5293
5303
  "whatToDoNext": "find the 'English' option and click on it",
@@ -5296,7 +5306,39 @@ By viewing the page screenshot and description, you should consider this and out
5296
5306
  }
5297
5307
  \`\`\`
5298
5308
 
5299
- ## Example #2 : When task is accomplished, don't plan more actions
5309
+
5310
+ ## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
5311
+
5312
+ If the user says "If there is a popup, close it", you should consider this and output the JSON:
5313
+
5314
+ * By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
5315
+ * The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyIfStatement\` action.
5316
+
5317
+ \`\`\`json
5318
+ {
5319
+ "actions": [{
5320
+ "thought": "There is no popup on the page",
5321
+ "type": "FalsyIfStatement",
5322
+ "param": null
5323
+ }
5324
+ ],
5325
+ "taskWillBeAccomplished": true,
5326
+ "furtherPlan": null
5327
+ }
5328
+ \`\`\`
5329
+
5330
+ For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
5331
+
5332
+ \`\`\`json
5333
+ {
5334
+ "actions": [],
5335
+ "error": "The instruction and page context are irrelevant, there is no popup on the page",
5336
+ "taskWillBeAccomplished": true,
5337
+ "furtherPlan": null
5338
+ }
5339
+ \`\`\`
5340
+
5341
+ ## Example #3 : When task is accomplished, don't plan more actions
5300
5342
 
5301
5343
  When the user asks to "Wait 4s", you should consider this:
5302
5344
 
@@ -5367,7 +5409,7 @@ var planSchema = {
5367
5409
  },
5368
5410
  param: {
5369
5411
  type: ["object", "null"],
5370
- description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
5412
+ description: "Parameter of the action, can be null ONLY when the type field is Tap, Hover, or FalsyIfStatement"
5371
5413
  },
5372
5414
  locate: {
5373
5415
  type: ["object", "null"],
@@ -5841,7 +5883,7 @@ pageDescription:
5841
5883
  ${pageDescription}
5842
5884
 
5843
5885
 
5844
- Here is what you need to do now:
5886
+ Here is the instruction:
5845
5887
  =====================================
5846
5888
  ${userPrompt}
5847
5889
  =====================================
@@ -1,8 +1,8 @@
1
- import { g as AIUsageInfo } from './types-7bcbf7fe.js';
1
+ import { g as AIUsageInfo } from './types-20204347.js';
2
2
  import { ChatCompletionMessageParam } from 'openai/resources';
3
3
  export { ChatCompletionMessageParam } from 'openai/resources';
4
- import { A as AIActionType } from './index-2b4593d9.js';
5
- export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
4
+ import { A as AIActionType } from './index-12fdcf10.js';
5
+ export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-12fdcf10.js';
6
6
 
7
7
  declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
8
8
  content: T;
@@ -1,4 +1,4 @@
1
- import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-7bcbf7fe.js';
1
+ import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-20204347.js';
2
2
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
3
3
 
4
4
  type AIArgs = [
@@ -1,8 +1,8 @@
1
- import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-7bcbf7fe.js';
2
- export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7bcbf7fe.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-20204347.js';
2
+ export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-20204347.js';
3
3
  export { allAIConfig, getAIConfig, overrideAIConfig } from './env.js';
4
- import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-2b4593d9.js';
5
- export { p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
4
+ import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-12fdcf10.js';
5
+ export { p as plan, t as transformElementPositionToId } from './index-12fdcf10.js';
6
6
  export { getLogDirByType, getVersion, setLogDir } from './utils.js';
7
7
  import 'openai/resources';
8
8
 
@@ -176,7 +176,7 @@ interface PlanningLocateParam {
176
176
  }
177
177
  interface PlanningAction<ParamType = any> {
178
178
  thought?: string;
179
- type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
179
+ type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyIfStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
180
180
  param: ParamType;
181
181
  locate: PlanningLocateParam | null;
182
182
  }
@@ -197,7 +197,9 @@ interface PlanningActionParamInputOrKeyPress {
197
197
  value: string;
198
198
  }
199
199
  interface PlanningActionParamScroll {
200
- scrollType: 'scrollUntilTop' | 'scrollUntilBottom' | 'scrollUpOneScreen' | 'scrollDownOneScreen';
200
+ direction: 'down' | 'up' | 'right' | 'left';
201
+ scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
202
+ distance: null | number;
201
203
  }
202
204
  interface PlanningActionParamAssert {
203
205
  assertion: string;
@@ -1,4 +1,4 @@
1
- import { s as ReportDumpWithAttributes, R as Rect } from './types-7bcbf7fe.js';
1
+ import { s as ReportDumpWithAttributes, R as Rect } from './types-20204347.js';
2
2
  import 'openai/resources';
3
3
 
4
4
  declare const insightDumpFileExt = "insight-dump.json";
package/dist/lib/utils.js CHANGED
@@ -272,7 +272,7 @@ function stringifyDumpData(data, indents) {
272
272
  return JSON.stringify(data, replacerForPageObject, indents);
273
273
  }
274
274
  function getVersion() {
275
- return "0.8.6";
275
+ return "0.8.7-beta-20241218070032.0";
276
276
  }
277
277
  function debugLog(...message) {
278
278
  const debugMode = getAIConfig(MIDSCENE_DEBUG_MODE);
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
4
- "version": "0.8.6",
4
+ "version": "0.8.7-beta-20241218070032.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "jsnext:source": "./src/index.ts",
@@ -39,7 +39,7 @@
39
39
  "openai": "4.57.1",
40
40
  "optional": "0.1.4",
41
41
  "socks-proxy-agent": "8.0.4",
42
- "@midscene/shared": "0.8.6"
42
+ "@midscene/shared": "0.8.7-beta-20241218070032.0"
43
43
  },
44
44
  "devDependencies": {
45
45
  "@modern-js/module-tools": "2.60.6",