npm - @midscene/core - Versions diffs - 0.8.6 → 0.8.7 - Mend

@midscene/core 0.8.6 → 0.8.7

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/lib/ai-model.js +71 -33
package/dist/lib/index.js +72 -34
package/dist/lib/types/ai-model.d.ts +3 -3
package/dist/lib/types/{index-2b4593d9.d.ts → index-43fd19f4.d.ts} +2 -3
package/dist/lib/types/index.d.ts +4 -4
package/dist/lib/types/{types-7bcbf7fe.d.ts → types-55182ae1.d.ts} +4 -2
package/dist/lib/types/utils.d.ts +1 -1
package/dist/lib/utils.js +1 -1
package/package.json +2 -2
package/report/index.html +2 -2

package/dist/lib/ai-model.js CHANGED Viewed

@@ -4355,8 +4355,8 @@ var allAIConfig = () => {
 // src/ai-model/common.ts
 async function callAiFn(options) {
-  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
-  if (preferOpenAIModel(useModel)) {
+  const { msgs, AIActionType: AIActionTypeValue } = options;
+  if (preferOpenAIModel("openAI")) {
     const { content, usage } = await callToGetJSONObject(
       msgs,
       AIActionTypeValue
@@ -4823,14 +4823,14 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Objective
-- Decompose the task user asked into a series of actions
+- Decompose the instruction user asked into a series of actions
 - Locate the target element if possible
-- If the task cannot be accomplished, give a further plan.
+- If the instruction cannot be accomplished, give a further plan.
 ## Workflow
 1. Receive the user's element description, screenshot, and instruction.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@@ -4841,7 +4841,8 @@ You are a versatile professional in software UI automation. Your outstanding con
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
+- Respond only with valid JSON. Do not write an introduction or summary.
+- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
 ## About the \`actions\` field
@@ -4866,10 +4867,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
   * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
 - type: 'KeyboardPress', press a key
   * { param: { value: string } }
-- type: 'Scroll'
-  * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
-- type: 'Error'
-  * { param: { message: string } }
+- type: 'Scroll', scroll up or down.
+  * {
+      locate: LocateParam | null,
+      param: {
+        direction: 'down'(default) | 'up' | 'right' | 'left',
+        scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
+        distance: null | number
+      }
+    }
+    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
+    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+- type: 'FalsyConditionStatement'
+  * { param: null }
+  * use this action when the instruction is an "if" statement and the condition is falsy.
 - type: 'Sleep'
   * { param: { timeMs: number } }
@@ -4883,7 +4894,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
 ## Output JSON Format:
-Please return the result in JSON format as follows:
+The JSON format is as follows:
 {
   "actions": [
     {
@@ -4941,6 +4953,7 @@ By viewing the page screenshot and description, you should consider this and out
       "locate": null
     },
   ],
+  "error": null,
   "taskWillBeAccomplished": false,
   "furtherPlan": {
     "whatToDoNext": "find the 'English' option and click on it",
@@ -4949,7 +4962,39 @@ By viewing the page screenshot and description, you should consider this and out
 }
 \`\`\`
-## Example #2 : When task is accomplished, don't plan more actions
+## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
+If the user says "If there is a popup, close it", you should consider this and output the JSON:
+* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
+* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
+\`\`\`json
+{
+  "actions": [{
+      "thought": "There is no popup on the page",
+      "type": "FalsyConditionStatement",
+      "param": null
+    }
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
+\`\`\`json
+{
+  "actions": [],
+  "error": "The instruction and page context are irrelevant, there is no popup on the page",
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+## Example #3 : When task is accomplished, don't plan more actions
 When the user ask to "Wait 4s", you should consider this:
@@ -5020,7 +5065,7 @@ var planSchema = {
               },
               param: {
                 type: ["object", "null"],
-                description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
+                description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
               },
               locate: {
                 type: ["object", "null"],
@@ -5154,11 +5199,9 @@ async function call(messages, responseFormat) {
   return { content, usage: completion.usage };
 }
 async function callToGetJSONObject(messages, AIActionTypeValue) {
-  let responseFormat = {
-    type: "json_object" /* JSON */
-  };
+  let responseFormat;
   const model = getModelName();
-  if (model === "gpt-4o-2024-08-06") {
+  if (model.includes("gpt-4o")) {
     switch (AIActionTypeValue) {
       case 0 /* ASSERT */:
         responseFormat = assertSchema;
@@ -5172,9 +5215,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
         responseFormat = planSchema;
         break;
     }
-  }
-  if (model.startsWith("gemini")) {
-    responseFormat = { type: "text" /* TEXT */ };
+    if (model === "gpt-4o-2024-05-13") {
+      responseFormat = { type: "json_object" /* JSON */ };
+    }
   }
   const safeJsonParse = (input) => {
     try {
@@ -5192,7 +5235,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
   try {
     return { content: JSON.parse(jsonContent), usage: response.usage };
   } catch (e) {
-    throw Error(`parse json error: ${response.content}`);
+    throw Error(`failed to parse json response: ${response.content}`);
   }
 }
 function extractJSONFromCodeBlock(response) {
@@ -5239,7 +5282,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
   };
 }
 async function AiInspectElement(options) {
-  const { context, multi, targetElementDescription, callAI, useModel } = options;
+  const { context, multi, targetElementDescription, callAI } = options;
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
   const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
   if (options.quickAnswer) {
@@ -5314,8 +5357,7 @@ ${JSON.stringify({
   if (callAI) {
     const res = await callAI({
       msgs,
-      AIActionType: 1 /* INSPECT_ELEMENT */,
-      useModel
+      AIActionType: 1 /* INSPECT_ELEMENT */
     });
     return {
       parseResult: transformElementPositionToId(res.content, context.content),
@@ -5326,8 +5368,7 @@ ${JSON.stringify({
   }
   const inspectElement = await callAiFn({
     msgs,
-    AIActionType: 1 /* INSPECT_ELEMENT */,
-    useModel
+    AIActionType: 1 /* INSPECT_ELEMENT */
   });
   return {
     parseResult: transformElementPositionToId(
@@ -5377,7 +5418,6 @@ DATA_DEMAND ends.
   ];
   const result = await callAiFn({
     msgs,
-    useModel,
     AIActionType: 2 /* EXTRACT_DATA */
   });
   return {
@@ -5420,8 +5460,7 @@ async function AiAssert(options) {
   ];
   const { content: assertResult, usage } = await callAiFn({
     msgs,
-    AIActionType: 0 /* ASSERT */,
-    useModel
+    AIActionType: 0 /* ASSERT */
   });
   return {
     content: assertResult,
@@ -5431,7 +5470,7 @@ async function AiAssert(options) {
 // src/ai-model/automation/index.ts
 var import_node_assert4 = __toESM(require("assert"));
-async function plan(userPrompt, opts, useModel) {
+async function plan(userPrompt, opts) {
   const { callAI, context } = opts || {};
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
   const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
@@ -5468,7 +5507,7 @@ pageDescription:
 ${pageDescription}
-Here is what you need to do now:
+Here is the instruction:
 =====================================
 ${userPrompt}
 =====================================
@@ -5482,8 +5521,7 @@ ${taskBackgroundContext}
   const call2 = callAI || callAiFn;
   const { content, usage } = await call2({
     msgs,
-    AIActionType: 3 /* PLAN */,
-    useModel
+    AIActionType: 3 /* PLAN */
   });
   const planFromAI = content;
   const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];

package/dist/lib/index.js CHANGED Viewed

@@ -4506,7 +4506,7 @@ function stringifyDumpData(data, indents) {
   return JSON.stringify(data, replacerForPageObject, indents);
 }
 function getVersion() {
-  return "0.8.6";
+  return "0.8.7";
 }
 // src/action/executor.ts
@@ -5170,14 +5170,14 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Objective
-- Decompose the task user asked into a series of actions
+- Decompose the instruction user asked into a series of actions
 - Locate the target element if possible
-- If the task cannot be accomplished, give a further plan.
+- If the instruction cannot be accomplished, give a further plan.
 ## Workflow
 1. Receive the user's element description, screenshot, and instruction.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@@ -5188,7 +5188,8 @@ You are a versatile professional in software UI automation. Your outstanding con
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
+- Respond only with valid JSON. Do not write an introduction or summary.
+- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
 ## About the \`actions\` field
@@ -5213,10 +5214,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
   * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
 - type: 'KeyboardPress', press a key
   * { param: { value: string } }
-- type: 'Scroll'
-  * { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
-- type: 'Error'
-  * { param: { message: string } }
+- type: 'Scroll', scroll up or down.
+  * {
+      locate: LocateParam | null,
+      param: {
+        direction: 'down'(default) | 'up' | 'right' | 'left',
+        scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
+        distance: null | number
+      }
+    }
+    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
+    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+- type: 'FalsyConditionStatement'
+  * { param: null }
+  * use this action when the instruction is an "if" statement and the condition is falsy.
 - type: 'Sleep'
   * { param: { timeMs: number } }
@@ -5230,7 +5241,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
 ## Output JSON Format:
-Please return the result in JSON format as follows:
+The JSON format is as follows:
 {
   "actions": [
     {
@@ -5288,6 +5300,7 @@ By viewing the page screenshot and description, you should consider this and out
       "locate": null
     },
   ],
+  "error": null,
   "taskWillBeAccomplished": false,
   "furtherPlan": {
     "whatToDoNext": "find the 'English' option and click on it",
@@ -5296,7 +5309,39 @@ By viewing the page screenshot and description, you should consider this and out
 }
 \`\`\`
-## Example #2 : When task is accomplished, don't plan more actions
+## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
+If the user says "If there is a popup, close it", you should consider this and output the JSON:
+* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
+* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
+\`\`\`json
+{
+  "actions": [{
+      "thought": "There is no popup on the page",
+      "type": "FalsyConditionStatement",
+      "param": null
+    }
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
+\`\`\`json
+{
+  "actions": [],
+  "error": "The instruction and page context are irrelevant, there is no popup on the page",
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+\`\`\`
+## Example #3 : When task is accomplished, don't plan more actions
 When the user ask to "Wait 4s", you should consider this:
@@ -5367,7 +5412,7 @@ var planSchema = {
               },
               param: {
                 type: ["object", "null"],
-                description: "Parameter towards the task type, can be null only when the type field is Tap or Hover"
+                description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
               },
               locate: {
                 type: ["object", "null"],
@@ -5501,11 +5546,9 @@ async function call(messages, responseFormat) {
   return { content, usage: completion.usage };
 }
 async function callToGetJSONObject(messages, AIActionTypeValue) {
-  let responseFormat = {
-    type: "json_object" /* JSON */
-  };
+  let responseFormat;
   const model = getModelName();
-  if (model === "gpt-4o-2024-08-06") {
+  if (model.includes("gpt-4o")) {
     switch (AIActionTypeValue) {
       case 0 /* ASSERT */:
         responseFormat = assertSchema;
@@ -5519,9 +5562,9 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
         responseFormat = planSchema;
         break;
     }
-  }
-  if (model.startsWith("gemini")) {
-    responseFormat = { type: "text" /* TEXT */ };
+    if (model === "gpt-4o-2024-05-13") {
+      responseFormat = { type: "json_object" /* JSON */ };
+    }
   }
   const safeJsonParse = (input) => {
     try {
@@ -5539,7 +5582,7 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
   try {
     return { content: JSON.parse(jsonContent), usage: response.usage };
   } catch (e) {
-    throw Error(`parse json error: ${response.content}`);
+    throw Error(`failed to parse json response: ${response.content}`);
   }
 }
 function extractJSONFromCodeBlock(response) {
@@ -5560,8 +5603,8 @@ function extractJSONFromCodeBlock(response) {
 // src/ai-model/common.ts
 async function callAiFn(options) {
-  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
-  if (preferOpenAIModel(useModel)) {
+  const { msgs, AIActionType: AIActionTypeValue } = options;
+  if (preferOpenAIModel("openAI")) {
     const { content, usage } = await callToGetJSONObject(
       msgs,
       AIActionTypeValue
@@ -5612,7 +5655,7 @@ function transformElementPositionToId(aiResult, elementsInfo) {
   };
 }
 async function AiInspectElement(options) {
-  const { context, multi, targetElementDescription, callAI, useModel } = options;
+  const { context, multi, targetElementDescription, callAI } = options;
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
   const { description, elementById, elementByPosition: elementByPosition2 } = await describeUserPage(context);
   if (options.quickAnswer) {
@@ -5687,8 +5730,7 @@ ${JSON.stringify({
   if (callAI) {
     const res = await callAI({
       msgs,
-      AIActionType: 1 /* INSPECT_ELEMENT */,
-      useModel
+      AIActionType: 1 /* INSPECT_ELEMENT */
     });
     return {
       parseResult: transformElementPositionToId(res.content, context.content),
@@ -5699,8 +5741,7 @@ ${JSON.stringify({
   }
   const inspectElement = await callAiFn({
     msgs,
-    AIActionType: 1 /* INSPECT_ELEMENT */,
-    useModel
+    AIActionType: 1 /* INSPECT_ELEMENT */
   });
   return {
     parseResult: transformElementPositionToId(
@@ -5750,7 +5791,6 @@ DATA_DEMAND ends.
   ];
   const result = await callAiFn({
     msgs,
-    useModel,
     AIActionType: 2 /* EXTRACT_DATA */
   });
   return {
@@ -5793,8 +5833,7 @@ async function AiAssert(options) {
   ];
   const { content: assertResult, usage } = await callAiFn({
     msgs,
-    AIActionType: 0 /* ASSERT */,
-    useModel
+    AIActionType: 0 /* ASSERT */
   });
   return {
     content: assertResult,
@@ -5804,7 +5843,7 @@ async function AiAssert(options) {
 // src/ai-model/automation/index.ts
 var import_node_assert6 = __toESM(require("assert"));
-async function plan(userPrompt, opts, useModel) {
+async function plan(userPrompt, opts) {
   const { callAI, context } = opts || {};
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
   const { description: pageDescription, elementByPosition: elementByPosition2 } = await describeUserPage(context);
@@ -5841,7 +5880,7 @@ pageDescription:
 ${pageDescription}
-Here is what you need to do now:
+Here is the instruction:
 =====================================
 ${userPrompt}
 =====================================
@@ -5855,8 +5894,7 @@ ${taskBackgroundContext}
   const call2 = callAI || callAiFn;
   const { content, usage } = await call2({
     msgs,
-    AIActionType: 3 /* PLAN */,
-    useModel
+    AIActionType: 3 /* PLAN */
   });
   const planFromAI = content;
   const actions = (planFromAI == null ? void 0 : planFromAI.actions) || [];

package/dist/lib/types/ai-model.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-import { g as AIUsageInfo } from './types-7bcbf7fe.js';
+import { g as AIUsageInfo } from './types-55182ae1.js';
 import { ChatCompletionMessageParam } from 'openai/resources';
 export { ChatCompletionMessageParam } from 'openai/resources';
-import { A as AIActionType } from './index-2b4593d9.js';
-export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
+import { A as AIActionType } from './index-43fd19f4.js';
+export { f as AiAssert, e as AiExtractElementInfo, b as AiInspectElement, c as callAiFn, d as describeUserPage, p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
 declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
     content: T;

package/dist/lib/types/{index-2b4593d9.d.ts → index-43fd19f4.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-7bcbf7fe.js';
+import { g as AIUsageInfo, B as BaseElement, U as UIContext, m as AIElementResponse, A as AISingleElementResponse, i as AISingleElementResponseById, n as AISectionParseResponse, o as AIAssertionResponse, F as PlanningAIResponse } from './types-55182ae1.js';
 import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
 type AIArgs = [
@@ -14,7 +14,6 @@ declare enum AIActionType {
 declare function callAiFn<T>(options: {
     msgs: AIArgs;
     AIActionType: AIActionType;
-    useModel?: 'openAI' | 'coze';
 }): Promise<{
     content: T;
     usage?: AIUsageInfo;
@@ -116,6 +115,6 @@ declare function plan(userPrompt: string, opts: {
     originalPrompt?: string;
     context: UIContext;
     callAI?: typeof callAiFn<PlanningAIResponse>;
-}, useModel?: 'coze' | 'openAI'): Promise<PlanningAIResponse>;
+}): Promise<PlanningAIResponse>;
 export { AIActionType as A, retrieveSection as a, AiInspectElement as b, callAiFn as c, describeUserPage as d, AiExtractElementInfo as e, AiAssert as f, plan as p, retrieveElement as r, transformElementPositionToId as t };

package/dist/lib/types/index.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-7bcbf7fe.js';
-export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7bcbf7fe.js';
+import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, A as AISingleElementResponse, f as InsightAssertionResponse } from './types-55182ae1.js';
+export { o as AIAssertionResponse, k as AIElementIdResponse, l as AIElementPositionResponse, m as AIElementResponse, h as AIResponseFormat, n as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, x as AgentAssertOpt, w as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, r as DumpMeta, v as ElementById, p as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, t as InsightDump, q as InsightExtractParam, L as LiteUISection, u as PartialInsightDumpFromSDK, F as PlanningAIResponse, z as PlanningAction, O as PlanningActionParamAssert, T as PlanningActionParamError, K as PlanningActionParamHover, M as PlanningActionParamInputOrKeyPress, H as PlanningActionParamPlan, N as PlanningActionParamScroll, Q as PlanningActionParamSleep, J as PlanningActionParamTap, V as PlanningActionParamWaitFor, G as PlanningFurtherPlan, y as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, s as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-55182ae1.js';
 export { allAIConfig, getAIConfig, overrideAIConfig } from './env.js';
-import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-2b4593d9.js';
-export { p as plan, t as transformElementPositionToId } from './index-2b4593d9.js';
+import { c as callAiFn, r as retrieveElement, a as retrieveSection } from './index-43fd19f4.js';
+export { p as plan, t as transformElementPositionToId } from './index-43fd19f4.js';
 export { getLogDirByType, getVersion, setLogDir } from './utils.js';
 import 'openai/resources';

package/dist/lib/types/{types-7bcbf7fe.d.ts → types-55182ae1.d.ts} RENAMED Viewed

@@ -176,7 +176,7 @@ interface PlanningLocateParam {
 }
 interface PlanningAction<ParamType = any> {
     thought?: string;
-    type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
+    type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep';
     param: ParamType;
     locate: PlanningLocateParam | null;
 }
@@ -197,7 +197,9 @@ interface PlanningActionParamInputOrKeyPress {
     value: string;
 }
 interface PlanningActionParamScroll {
-    scrollType: 'scrollUntilTop' | 'scrollUntilBottom' | 'scrollUpOneScreen' | 'scrollDownOneScreen';
+    direction: 'down' | 'up' | 'right' | 'left';
+    scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
+    distance: null | number;
 }
 interface PlanningActionParamAssert {
     assertion: string;

package/dist/lib/types/utils.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { s as ReportDumpWithAttributes, R as Rect } from './types-7bcbf7fe.js';
+import { s as ReportDumpWithAttributes, R as Rect } from './types-55182ae1.js';
 import 'openai/resources';
 declare const insightDumpFileExt = "insight-dump.json";

package/dist/lib/utils.js CHANGED Viewed

@@ -272,7 +272,7 @@ function stringifyDumpData(data, indents) {
   return JSON.stringify(data, replacerForPageObject, indents);
 }
 function getVersion() {
-  return "0.8.6";
+  return "0.8.7";
 }
 function debugLog(...message) {
   const debugMode = getAIConfig(MIDSCENE_DEBUG_MODE);

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@midscene/core",
   "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
-  "version": "0.8.6",
+  "version": "0.8.7",
   "repository": "https://github.com/web-infra-dev/midscene",
   "homepage": "https://midscenejs.com/",
   "jsnext:source": "./src/index.ts",
@@ -39,7 +39,7 @@
     "openai": "4.57.1",
     "optional": "0.1.4",
     "socks-proxy-agent": "8.0.4",
-    "@midscene/shared": "0.8.6"
+    "@midscene/shared": "0.8.7"
   },
   "devDependencies": {
     "@modern-js/module-tools": "2.60.6",