npm - @midscene/core - Versions diffs - 0.8.6 → 0.8.7-beta-20241218070032.0 - Mend

@midscene/core 0.8.6 → 0.8.7-beta-20241218070032.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/lib/ai-model.js +53 -11
package/dist/lib/index.js +54 -12
package/dist/lib/types/ai-model.d.ts +3 -3
package/dist/lib/types/{index-2b4593d9.d.ts → index-12fdcf10.d.ts} +1 -1
package/dist/lib/types/index.d.ts +4 -4
package/dist/lib/types/{types-7bcbf7fe.d.ts → types-20204347.d.ts} +4 -2
package/dist/lib/types/utils.d.ts +1 -1
package/dist/lib/utils.js +1 -1
package/package.json +2 -2
package/report/index.html +2 -2

package/dist/lib/ai-model.js CHANGED Viewed

@@ -4823,14 +4823,14 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Objective
-- Decompose the task user asked into a series of actions
+- Decompose the instruction user asked into a series of actions
 - Locate the target element if possible
-- If the task cannot be accomplished, give a further plan.
+- If the instruction cannot be accomplished, give a further plan.
 ## Workflow
 1. Receive the user's element description, screenshot, and instruction.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyIfStatement / Sleep). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@@ -4841,7 +4841,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
+- When the user says "If something is true, do something" in the instruction, follow it, tell if it's truthy, and give the corresponding actions. If it's not truthy, as long as the instruction is an "if" statement, it means the user can tolerate it. Just leave a \`FalsyIfStatement\` action.
 ## About the \`actions\` field
@@ -4866,10 +4866,19 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
   * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
 - type: 'KeyboardPress', press a key
   * { param: { value: string } }
-- type: 'Scroll'
-  * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
-- type: 'Error'
-  * { param: { message: string } }
+- type: 'Scroll', scroll up or down.
+  * {
+      locate: LocateParam | null,
+      param: {
+        direction: 'down'(default) | 'up' | 'right' | 'left',
+        scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
+        distance: null | number
+      }
+    }
+    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
+    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+- type: 'FalsyIfStatement', when there is a falsy condition and the instruction is an "if" statement (means the user can tolerate this situation)
+  * { param: null }
 - type: 'Sleep'
   * { param: { timeMs: number } }
@@ -4941,6 +4950,7 @@ By viewing the page screenshot and description, you should consider this and out
       "locate": null
     },
   ],
+  "error": null,
   "taskWillBeAccomplished": false,
   "furtherPlan": {
     "whatToDoNext": "find the 'English' option and click on it",
@@ -4949,7 +4959,39 @@ By viewing the page screenshot and description, you should consider this and out
 }
 \`\`\`
-## Example #2 : When task is accomplished, don't plan more actions
+## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
+If the user says "If there is a popup, close it", you should consider this and output the JSON:
+* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
+* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyIfStatement\` action.
+\`\`\`json
+{
+  "actions": [{
+      "thought": "There is no popup on the page",
+      "type": "FalsyIfStatement",
+      "param": null
+    }
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
+\`\`\`json
+{
+  "actions": [],
+  "error": "The instruction and page context are irrelevant, there is no popup on the page",
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+## Example #3 : When task is accomplished, don't plan more actions
 When the user ask to "Wait 4s", you should consider this:
@@ -5020,7 +5062,7 @@ var planSchema = {
               },
               param: {
                 type: ["object", "null"],
-                description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
+                description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
               },
               locate: {
                 type: ["object", "null"],
@@ -5468,7 +5510,7 @@ pageDescription:
 ${pageDescription}
-Here is what you need to do now:
+Here is the instruction:
 =====================================
 ${userPrompt}
 =====================================

package/dist/lib/index.js CHANGED Viewed

@@ -4506,7 +4506,7 @@ function stringifyDumpData(data, indents) {
   return JSON.stringify(data, replacerForPageObject, indents);
 }
 function getVersion() {
-  return "0.8.6";
+  return "0.8.7-beta-20241218070032.0";
 }
 // src/action/executor.ts
@@ -5170,14 +5170,14 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Objective
-- Decompose the task user asked into a series of actions
+- Decompose the instruction user asked into a series of actions
 - Locate the target element if possible
-- If the task cannot be accomplished, give a further plan.
+- If the instruction cannot be accomplished, give a further plan.
 ## Workflow
 1. Receive the user's element description, screenshot, and instruction.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyIfStatement / Sleep). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@@ -5188,7 +5188,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
+- When the user says "If something is true, do something" in the instruction, follow it, tell if it's truthy, and give the corresponding actions. If it's not truthy, as long as the instruction is an "if" statement, it means the user can tolerate it. Just leave a \`FalsyIfStatement\` action.
 ## About the \`actions\` field
@@ -5213,10 +5213,19 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
   * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
 - type: 'KeyboardPress', press a key
   * { param: { value: string } }
-- type: 'Scroll'
-  * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
-- type: 'Error'
-  * { param: { message: string } }
+- type: 'Scroll', scroll up or down.
+  * {
+      locate: LocateParam | null,
+      param: {
+        direction: 'down'(default) | 'up' | 'right' | 'left',
+        scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
+        distance: null | number
+      }
+    }
+    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
+    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+- type: 'FalsyIfStatement', when there is a falsy condition and the instruction is an "if" statement (means the user can tolerate this situation)
+  * { param: null }
 - type: 'Sleep'
   * { param: { timeMs: number } }
@@ -5288,6 +5297,7 @@ By viewing the page screenshot and description, you should consider this and out
       "locate": null
     },
   ],
+  "error": null,
   "taskWillBeAccomplished": false,
   "furtherPlan": {
     "whatToDoNext": "find the 'English' option and click on it",
@@ -5296,7 +5306,39 @@ By viewing the page screenshot and description, you should consider this and out
 }
 \`\`\`
-## Example #2 : When task is accomplished, don't plan more actions
+## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
+If the user says "If there is a popup, close it", you should consider this and output the JSON:
+* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
+* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyIfStatement\` action.
+\`\`\`json
+{
+  "actions": [{
+      "thought": "There is no popup on the page",
+      "type": "FalsyIfStatement",
+      "param": null
+    }
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
+\`\`\`json
+{
+  "actions": [],
+  "error": "The instruction and page context are irrelevant, there is no popup on the page",
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+## Example #3 : When task is accomplished, don't plan more actions
 When the user ask to "Wait 4s", you should consider this:
@@ -5367,7 +5409,7 @@ var planSchema = {
               },
               param: {
                 type: ["object", "null"],
-                description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
+                description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
               },
               locate: {
                 type: ["object", "null"],
@@ -5841,7 +5883,7 @@ pageDescription:
 ${pageDescription}
-Here is what you need to do now:
+Here is the instruction:
 =====================================
 ${userPrompt}
 =====================================

package/dist/lib/types/ai-model.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-import { g as AIUsageInfo } from './types-7bcbf7fe.js';
+import { g as AIUsageInfo } from './types-20204347.js';
 import { ChatCompletionMessageParam } from 'openai/resources';
 export { ChatCompletionMessageParam } from 'openai/resources';
-import { A as AIActionType } from './index-2b4593d9.js';
-export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
+import { A as AIActionType } from './index-12fdcf10.js';
+export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-12fdcf10.js';
 declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
     content: T;

package/dist/lib/types/{index-2b4593d9.d.ts → index-12fdcf10.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-7bcbf7fe.js';
+import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-20204347.js';
 import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
 type AIArgs = [

package/dist/lib/types/index.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-7bcbf7fe.js';
-export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7bcbf7fe.js';
+import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-20204347.js';
+export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-20204347.js';
 export { allAIConfig, getAIConfig, overrideAIConfig } from './env.js';
-import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-2b4593d9.js';
-export { p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
+import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-12fdcf10.js';
+export { p as plan, t as transformElementPositionToId } from './index-12fdcf10.js';
 export { getLogDirByType, getVersion, setLogDir } from './utils.js';
 import 'openai/resources';

package/dist/lib/types/{types-7bcbf7fe.d.ts → types-20204347.d.ts} RENAMED Viewed

@@ -176,7 +176,7 @@ interface PlanningLocateParam {
 }
 interface PlanningAction<ParamType = any> {
     thought?: string;
-    type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
+    type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyIfStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
     param: ParamType;
     locate: PlanningLocateParam | null;
 }
@@ -197,7 +197,9 @@ interface PlanningActionParamInputOrKeyPress {
     value: string;
 }
 interface PlanningActionParamScroll {
-    scrollType: 'scrollUntilTop' | 'scrollUntilBottom' | 'scrollUpOneScreen' | 'scrollDownOneScreen';
+    direction: 'down' | 'up' | 'right' | 'left';
+    scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
+    distance: null | number;
 }
 interface PlanningActionParamAssert {
     assertion: string;

package/dist/lib/types/utils.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { s as ReportDumpWithAttributes, R as Rect } from './types-7bcbf7fe.js';
+import { s as ReportDumpWithAttributes, R as Rect } from './types-20204347.js';
 import 'openai/resources';
 declare const insightDumpFileExt = "insight-dump.json";

package/dist/lib/utils.js CHANGED Viewed

@@ -272,7 +272,7 @@ function stringifyDumpData(data, indents) {
   return JSON.stringify(data, replacerForPageObject, indents);
 }
 function getVersion() {
-  return "0.8.6";
+  return "0.8.7-beta-20241218070032.0";
 }
 function debugLog(...message) {
   const debugMode = getAIConfig(MIDSCENE_DEBUG_MODE);

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@midscene/core",
   "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
-  "version": "0.8.6",
+  "version": "0.8.7-beta-20241218070032.0",
   "repository": "https://github.com/web-infra-dev/midscene",
   "homepage": "https://midscenejs.com/",
   "jsnext:source": "./src/index.ts",
@@ -39,7 +39,7 @@
     "openai": "4.57.1",
     "optional": "0.1.4",
     "socks-proxy-agent": "8.0.4",
-    "@midscene/shared": "0.8.6"
+    "@midscene/shared": "0.8.7-beta-20241218070032.0"
   },
   "devDependencies": {
     "@modern-js/module-tools": "2.60.6",