npm - @midscene/core - Versions diffs - 1.2.1-beta-20260108154312.0 → 1.2.1-beta-20260109060244.0 - Mend

@midscene/core 1.2.1-beta-20260108154312.0 → 1.2.1-beta-20260109060244.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

package/dist/es/agent/agent.mjs +14 -13
package/dist/es/agent/agent.mjs.map +1 -1
package/dist/es/agent/tasks.mjs +21 -14
package/dist/es/agent/tasks.mjs.map +1 -1
package/dist/es/agent/utils.mjs +1 -1
package/dist/es/ai-model/llm-planning.mjs +3 -12
package/dist/es/ai-model/llm-planning.mjs.map +1 -1
package/dist/es/ai-model/prompt/llm-planning.mjs +7 -2
package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
package/dist/es/ai-model/ui-tars-planning.mjs +1 -1
package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
package/dist/es/common.mjs +5 -14
package/dist/es/common.mjs.map +1 -1
package/dist/es/device/index.mjs +3 -28
package/dist/es/device/index.mjs.map +1 -1
package/dist/es/types.mjs.map +1 -1
package/dist/es/utils.mjs +2 -2
package/dist/lib/agent/agent.js +13 -12
package/dist/lib/agent/agent.js.map +1 -1
package/dist/lib/agent/tasks.js +21 -14
package/dist/lib/agent/tasks.js.map +1 -1
package/dist/lib/agent/utils.js +1 -1
package/dist/lib/ai-model/llm-planning.js +2 -11
package/dist/lib/ai-model/llm-planning.js.map +1 -1
package/dist/lib/ai-model/prompt/llm-planning.js +7 -2
package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
package/dist/lib/ai-model/ui-tars-planning.js +1 -1
package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
package/dist/lib/common.js +5 -20
package/dist/lib/common.js.map +1 -1
package/dist/lib/device/index.js +15 -52
package/dist/lib/device/index.js.map +1 -1
package/dist/lib/types.js.map +1 -1
package/dist/lib/utils.js +2 -2
package/dist/types/agent/agent.d.ts +15 -4
package/dist/types/agent/tasks.d.ts +1 -2
package/dist/types/common.d.ts +1 -8
package/dist/types/device/index.d.ts +0 -22
package/dist/types/types.d.ts +2 -1
package/package.json +2 -2
package/dist/es/ai-model/prompt/ui-tars-locator.mjs +0 -34
package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +0 -1
package/dist/lib/ai-model/prompt/ui-tars-locator.js +0 -68
package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +0 -1
package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +0 -1

package/dist/types/agent/agent.d.ts CHANGED Viewed

@@ -47,7 +47,6 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
      */
     private screenshotScalePromise?;
     private executionDumpIndexByRunner;
-    private fullActionSpace;
     get page(): InterfaceType;
     /**
      * Ensures VL model warning is shown once when needed
@@ -108,11 +107,19 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
      * @deprecated Use aiScroll(locatePrompt, opt) instead where opt contains the scroll parameters
      */
     aiScroll(scrollParam: ScrollParam, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
-    aiAct(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
+    aiAct(taskPrompt: string, opt?: AiActOptions): Promise<{
+        result: Record<string, any>;
+    } | {
+        yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
+    } | undefined>;
     /**
      * @deprecated Use {@link Agent.aiAct} instead.
      */
-    aiAction(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
+    aiAction(taskPrompt: string, opt?: AiActOptions): Promise<{
+        result: Record<string, any>;
+    } | {
+        yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
+    } | undefined>;
     aiQuery<ReturnType = any>(demand: ServiceExtractParam, opt?: ServiceExtractOption): Promise<ReturnType>;
     aiBoolean(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<boolean>;
     aiNumber(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<number>;
@@ -133,7 +140,11 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
         message: string | undefined;
     } | undefined>;
     aiWaitFor(assertion: TUserPrompt, opt?: AgentWaitForOpt): Promise<void>;
-    ai(...args: Parameters<typeof this.aiAct>): Promise<string | undefined>;
+    ai(...args: Parameters<typeof this.aiAct>): Promise<{
+        result: Record<string, any>;
+    } | {
+        yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
+    } | undefined>;
     runYaml(yamlScriptContent: string): Promise<{
         result: Record<string, any>;
     }>;

package/dist/types/agent/tasks.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type TMultimodalPrompt, type TUserPrompt } from '../common';
+import type { TMultimodalPrompt, TUserPrompt } from '../common';
 import type { AbstractInterface } from '../device';
 import type Service from '../service';
 import type { TaskRunner } from '../task-runner';
@@ -48,7 +48,6 @@ export declare class TaskExecutor {
     runPlans(title: string, plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig): Promise<ExecutionResult>;
     action(userPrompt: string, modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, includeBboxInPlanning: boolean, aiActContext?: string, cacheable?: boolean, replanningCycleLimitOverride?: number, imagesIncludeCount?: number, deepThink?: DeepThinkOption, fileChooserAccept?: string[]): Promise<ExecutionResult<{
         yamlFlow?: MidsceneYamlFlowItem[];
-        output?: string;
     } | undefined>>;
     private runAction;
     private createTypeQueryTask;

package/dist/types/common.d.ts CHANGED Viewed

@@ -20,7 +20,7 @@ export declare function mergeRects(rects: Rect[]): {
 };
 export declare function expandSearchArea(rect: Rect, screenSize: Size, vlMode: TVlModeTypes | undefined): Rect;
 export declare function markupImageForLLM(screenshotBase64: string, tree: ElementTreeNode<BaseElement>, size: Size): Promise<string>;
-export declare function buildYamlFlowFromPlans(plans: PlanningAction[], actionSpace: DeviceAction<any>[]): MidsceneYamlFlowItem[];
+export declare function buildYamlFlowFromPlans(plans: PlanningAction[], actionSpace: DeviceAction<any>[], sleep?: number): MidsceneYamlFlowItem[];
 export declare const PointSchema: z.ZodObject<{
     left: z.ZodNumber;
     top: z.ZodNumber;
@@ -558,11 +558,4 @@ export declare const loadActionParam: (jsonObject: Record<string, any>, zodSchem
  * so they are intentionally excluded from Zod parsing and use existing validation logic.
  */
 export declare const parseActionParam: (rawParam: Record<string, any> | undefined, zodSchema?: z.ZodType<any>) => Record<string, any> | undefined;
-export declare const finalizeActionName = "Finalize";
-/**
- * Get a readable time string for the current time
- * @param format - Optional format string. Supports: YYYY, MM, DD, HH, mm, ss. Default: 'YYYY-MM-DD HH:mm:ss'
- * @returns A formatted time string with format label
- */
-export declare const getReadableTimeString: (format?: string) => string;
 export {};

package/dist/types/device/index.d.ts CHANGED Viewed

@@ -2177,27 +2177,5 @@ export type ActionAssertParam = {
     result: boolean;
 };
 export declare const defineActionAssert: () => DeviceAction<ActionAssertParam>;
-export declare const ActionSleepParamSchema: z.ZodObject<{
-    millisecond: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
-}, "strip", z.ZodTypeAny, {
-    millisecond?: number | undefined;
-}, {
-    millisecond?: number | undefined;
-}>;
-export type ActionSleepParam = {
-    millisecond?: number;
-};
-export declare const defineActionSleep: () => DeviceAction<ActionSleepParam>;
-export declare const actionFinalizeParamSchema: z.ZodObject<{
-    message: z.ZodOptional<z.ZodString>;
-}, "strip", z.ZodTypeAny, {
-    message?: string | undefined;
-}, {
-    message?: string | undefined;
-}>;
-export type ActionFinalizeParam = {
-    message?: string;
-};
-export declare const defineActionFinalize: () => DeviceAction<ActionFinalizeParam>;
 export type { DeviceAction } from '../types';
 export type { AndroidDeviceOpt, AndroidDeviceInputOpt, IOSDeviceOpt, IOSDeviceInputOpt, } from './device-options';

package/dist/types/types.d.ts CHANGED Viewed

@@ -177,7 +177,9 @@ export interface PlanningAction<ParamType = any> {
 }
 export interface RawResponsePlanningAIResponse {
     action: PlanningAction;
+    more_actions_needed_by_instruction: boolean;
     log: string;
+    sleep?: number;
     error?: string;
 }
 export interface PlanningAIResponse extends Omit<RawResponsePlanningAIResponse, 'action'> {
@@ -188,7 +190,6 @@ export interface PlanningAIResponse extends Omit<RawResponsePlanningAIResponse,
     yamlString?: string;
     error?: string;
     reasoning_content?: string;
-    shouldContinuePlanning: boolean;
 }
 export interface PlanningActionParamSleep {
     timeMs: number;

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@midscene/core",
   "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
-  "version": "1.2.1-beta-20260108154312.0",
+  "version": "1.2.1-beta-20260109060244.0",
   "repository": "https://github.com/web-infra-dev/midscene",
   "homepage": "https://midscenejs.com/",
   "main": "./dist/lib/index.js",
@@ -89,7 +89,7 @@
     "semver": "7.5.2",
     "undici": "^6.0.0",
     "zod": "3.24.3",
-    "@midscene/shared": "1.2.1-beta-20260108154312.0"
+    "@midscene/shared": "1.2.1-beta-20260109060244.0"
   },
   "devDependencies": {
     "@rslib/core": "^0.18.3",

package/dist/es/ai-model/prompt/ui-tars-locator.mjs DELETED Viewed

@@ -1,34 +0,0 @@
-import { getPreferredLanguage } from "@midscene/shared/env";
-function systemPromptToLocateElementPosition() {
-    const preferredLanguage = getPreferredLanguage();
-    return `
-You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
-## Output Format
-\`\`\`
-Thought: ...
-Action: ...
-\`\`\`
-## Action Space
-click(start_box='[x1, y1, x2, y2]')
-left_double(start_box='[x1, y1, x2, y2]')
-right_single(start_box='[x1, y1, x2, y2]')
-drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
-hotkey(key='')
-type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
-scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
-wait() #Sleep for 5s and take a screenshot to check for any changes.
-finished()
-call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
-## Note
-- Use ${preferredLanguage} in \`Thought\` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
-## User Instruction
-    `;
-}
-export { systemPromptToLocateElementPosition };
-//# sourceMappingURL=ui-tars-locator.mjs.map

package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"ai-model/prompt/ui-tars-locator.mjs","sources":["../../../../src/ai-model/prompt/ui-tars-locator.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\n// claude 3.5 sonnet computer The ability to understand the content of the image is better, Does not provide element snapshot effect\nexport function systemPromptToLocateElementPosition() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\nclick(start_box='[x1, y1, x2, y2]')\nleft_double(start_box='[x1, y1, x2, y2]')\nright_single(start_box='[x1, y1, x2, y2]')\ndrag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\\\n\" at the end of \\`content\\`.\nscroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n## Note\n- Use ${preferredLanguage} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n `;\n}\n"],"names":["systemPromptToLocateElementPosition","preferredLanguage","getPreferredLanguage"],"mappings":";AAGO,SAASA;IACd,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;MAsBJ,EAAED,kBAAkB;;;;IAItB,CAAC;AACL"}

package/dist/lib/ai-model/prompt/ui-tars-locator.js DELETED Viewed

@@ -1,68 +0,0 @@
-"use strict";
-var __webpack_require__ = {};
-(()=>{
-    __webpack_require__.d = (exports1, definition)=>{
-        for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
-            enumerable: true,
-            get: definition[key]
-        });
-    };
-})();
-(()=>{
-    __webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
-})();
-(()=>{
-    __webpack_require__.r = (exports1)=>{
-        if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
-            value: 'Module'
-        });
-        Object.defineProperty(exports1, '__esModule', {
-            value: true
-        });
-    };
-})();
-var __webpack_exports__ = {};
-__webpack_require__.r(__webpack_exports__);
-__webpack_require__.d(__webpack_exports__, {
-    systemPromptToLocateElementPosition: ()=>systemPromptToLocateElementPosition
-});
-const env_namespaceObject = require("@midscene/shared/env");
-function systemPromptToLocateElementPosition() {
-    const preferredLanguage = (0, env_namespaceObject.getPreferredLanguage)();
-    return `
-You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
-## Output Format
-\`\`\`
-Thought: ...
-Action: ...
-\`\`\`
-## Action Space
-click(start_box='[x1, y1, x2, y2]')
-left_double(start_box='[x1, y1, x2, y2]')
-right_single(start_box='[x1, y1, x2, y2]')
-drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
-hotkey(key='')
-type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
-scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
-wait() #Sleep for 5s and take a screenshot to check for any changes.
-finished()
-call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
-## Note
-- Use ${preferredLanguage} in \`Thought\` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
-## User Instruction
-    `;
-}
-exports.systemPromptToLocateElementPosition = __webpack_exports__.systemPromptToLocateElementPosition;
-for(var __rspack_i in __webpack_exports__)if (-1 === [
-    "systemPromptToLocateElementPosition"
-].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
-Object.defineProperty(exports, '__esModule', {
-    value: true
-});
-//# sourceMappingURL=ui-tars-locator.js.map

package/dist/lib/ai-model/prompt/ui-tars-locator.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"ai-model/prompt/ui-tars-locator.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/ui-tars-locator.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { getPreferredLanguage } from '@midscene/shared/env';\n\n// claude 3.5 sonnet computer The ability to understand the content of the image is better, Does not provide element snapshot effect\nexport function systemPromptToLocateElementPosition() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\nclick(start_box='[x1, y1, x2, y2]')\nleft_double(start_box='[x1, y1, x2, y2]')\nright_single(start_box='[x1, y1, x2, y2]')\ndrag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\\\n\" at the end of \\`content\\`.\nscroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n## Note\n- Use ${preferredLanguage} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n `;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","systemPromptToLocateElementPosition","preferredLanguage","getPreferredLanguage"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACHO,SAASI;IACd,MAAMC,oBAAoBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA;IAE1B,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;MAsBJ,EAAED,kBAAkB;;;;IAItB,CAAC;AACL"}

package/dist/types/ai-model/prompt/ui-tars-locator.d.ts DELETED Viewed

@@ -1 +0,0 @@
-export declare function systemPromptToLocateElementPosition(): string;