@midscene/core 1.2.1-beta-20260108154312.0 → 1.2.1-beta-20260109060244.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/es/agent/agent.mjs +14 -13
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +21 -14
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +1 -1
  6. package/dist/es/ai-model/llm-planning.mjs +3 -12
  7. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  8. package/dist/es/ai-model/prompt/llm-planning.mjs +7 -2
  9. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/ui-tars-planning.mjs +1 -1
  11. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  12. package/dist/es/common.mjs +5 -14
  13. package/dist/es/common.mjs.map +1 -1
  14. package/dist/es/device/index.mjs +3 -28
  15. package/dist/es/device/index.mjs.map +1 -1
  16. package/dist/es/types.mjs.map +1 -1
  17. package/dist/es/utils.mjs +2 -2
  18. package/dist/lib/agent/agent.js +13 -12
  19. package/dist/lib/agent/agent.js.map +1 -1
  20. package/dist/lib/agent/tasks.js +21 -14
  21. package/dist/lib/agent/tasks.js.map +1 -1
  22. package/dist/lib/agent/utils.js +1 -1
  23. package/dist/lib/ai-model/llm-planning.js +2 -11
  24. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  25. package/dist/lib/ai-model/prompt/llm-planning.js +7 -2
  26. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  27. package/dist/lib/ai-model/ui-tars-planning.js +1 -1
  28. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  29. package/dist/lib/common.js +5 -20
  30. package/dist/lib/common.js.map +1 -1
  31. package/dist/lib/device/index.js +15 -52
  32. package/dist/lib/device/index.js.map +1 -1
  33. package/dist/lib/types.js.map +1 -1
  34. package/dist/lib/utils.js +2 -2
  35. package/dist/types/agent/agent.d.ts +15 -4
  36. package/dist/types/agent/tasks.d.ts +1 -2
  37. package/dist/types/common.d.ts +1 -8
  38. package/dist/types/device/index.d.ts +0 -22
  39. package/dist/types/types.d.ts +2 -1
  40. package/package.json +2 -2
  41. package/dist/es/ai-model/prompt/ui-tars-locator.mjs +0 -34
  42. package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +0 -1
  43. package/dist/lib/ai-model/prompt/ui-tars-locator.js +0 -68
  44. package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +0 -1
  45. package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +0 -1
@@ -47,7 +47,6 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
47
47
  */
48
48
  private screenshotScalePromise?;
49
49
  private executionDumpIndexByRunner;
50
- private fullActionSpace;
51
50
  get page(): InterfaceType;
52
51
  /**
53
52
  * Ensures VL model warning is shown once when needed
@@ -108,11 +107,19 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
108
107
  * @deprecated Use aiScroll(locatePrompt, opt) instead where opt contains the scroll parameters
109
108
  */
110
109
  aiScroll(scrollParam: ScrollParam, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
111
- aiAct(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
110
+ aiAct(taskPrompt: string, opt?: AiActOptions): Promise<{
111
+ result: Record<string, any>;
112
+ } | {
113
+ yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
114
+ } | undefined>;
112
115
  /**
113
116
  * @deprecated Use {@link Agent.aiAct} instead.
114
117
  */
115
- aiAction(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
118
+ aiAction(taskPrompt: string, opt?: AiActOptions): Promise<{
119
+ result: Record<string, any>;
120
+ } | {
121
+ yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
122
+ } | undefined>;
116
123
  aiQuery<ReturnType = any>(demand: ServiceExtractParam, opt?: ServiceExtractOption): Promise<ReturnType>;
117
124
  aiBoolean(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<boolean>;
118
125
  aiNumber(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<number>;
@@ -133,7 +140,11 @@ export declare class Agent<InterfaceType extends AbstractInterface = AbstractInt
133
140
  message: string | undefined;
134
141
  } | undefined>;
135
142
  aiWaitFor(assertion: TUserPrompt, opt?: AgentWaitForOpt): Promise<void>;
136
- ai(...args: Parameters<typeof this.aiAct>): Promise<string | undefined>;
143
+ ai(...args: Parameters<typeof this.aiAct>): Promise<{
144
+ result: Record<string, any>;
145
+ } | {
146
+ yamlFlow?: import("../yaml").MidsceneYamlFlowItem[];
147
+ } | undefined>;
137
148
  runYaml(yamlScriptContent: string): Promise<{
138
149
  result: Record<string, any>;
139
150
  }>;
@@ -1,4 +1,4 @@
1
- import { type TMultimodalPrompt, type TUserPrompt } from '../common';
1
+ import type { TMultimodalPrompt, TUserPrompt } from '../common';
2
2
  import type { AbstractInterface } from '../device';
3
3
  import type Service from '../service';
4
4
  import type { TaskRunner } from '../task-runner';
@@ -48,7 +48,6 @@ export declare class TaskExecutor {
48
48
  runPlans(title: string, plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig): Promise<ExecutionResult>;
49
49
  action(userPrompt: string, modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, includeBboxInPlanning: boolean, aiActContext?: string, cacheable?: boolean, replanningCycleLimitOverride?: number, imagesIncludeCount?: number, deepThink?: DeepThinkOption, fileChooserAccept?: string[]): Promise<ExecutionResult<{
50
50
  yamlFlow?: MidsceneYamlFlowItem[];
51
- output?: string;
52
51
  } | undefined>>;
53
52
  private runAction;
54
53
  private createTypeQueryTask;
@@ -20,7 +20,7 @@ export declare function mergeRects(rects: Rect[]): {
20
20
  };
21
21
  export declare function expandSearchArea(rect: Rect, screenSize: Size, vlMode: TVlModeTypes | undefined): Rect;
22
22
  export declare function markupImageForLLM(screenshotBase64: string, tree: ElementTreeNode<BaseElement>, size: Size): Promise<string>;
23
- export declare function buildYamlFlowFromPlans(plans: PlanningAction[], actionSpace: DeviceAction<any>[]): MidsceneYamlFlowItem[];
23
+ export declare function buildYamlFlowFromPlans(plans: PlanningAction[], actionSpace: DeviceAction<any>[], sleep?: number): MidsceneYamlFlowItem[];
24
24
  export declare const PointSchema: z.ZodObject<{
25
25
  left: z.ZodNumber;
26
26
  top: z.ZodNumber;
@@ -558,11 +558,4 @@ export declare const loadActionParam: (jsonObject: Record<string, any>, zodSchem
558
558
  * so they are intentionally excluded from Zod parsing and use existing validation logic.
559
559
  */
560
560
  export declare const parseActionParam: (rawParam: Record<string, any> | undefined, zodSchema?: z.ZodType<any>) => Record<string, any> | undefined;
561
- export declare const finalizeActionName = "Finalize";
562
- /**
563
- * Get a readable time string for the current time
564
- * @param format - Optional format string. Supports: YYYY, MM, DD, HH, mm, ss. Default: 'YYYY-MM-DD HH:mm:ss'
565
- * @returns A formatted time string with format label
566
- */
567
- export declare const getReadableTimeString: (format?: string) => string;
568
561
  export {};
@@ -2177,27 +2177,5 @@ export type ActionAssertParam = {
2177
2177
  result: boolean;
2178
2178
  };
2179
2179
  export declare const defineActionAssert: () => DeviceAction<ActionAssertParam>;
2180
- export declare const ActionSleepParamSchema: z.ZodObject<{
2181
- millisecond: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
2182
- }, "strip", z.ZodTypeAny, {
2183
- millisecond?: number | undefined;
2184
- }, {
2185
- millisecond?: number | undefined;
2186
- }>;
2187
- export type ActionSleepParam = {
2188
- millisecond?: number;
2189
- };
2190
- export declare const defineActionSleep: () => DeviceAction<ActionSleepParam>;
2191
- export declare const actionFinalizeParamSchema: z.ZodObject<{
2192
- message: z.ZodOptional<z.ZodString>;
2193
- }, "strip", z.ZodTypeAny, {
2194
- message?: string | undefined;
2195
- }, {
2196
- message?: string | undefined;
2197
- }>;
2198
- export type ActionFinalizeParam = {
2199
- message?: string;
2200
- };
2201
- export declare const defineActionFinalize: () => DeviceAction<ActionFinalizeParam>;
2202
2180
  export type { DeviceAction } from '../types';
2203
2181
  export type { AndroidDeviceOpt, AndroidDeviceInputOpt, IOSDeviceOpt, IOSDeviceInputOpt, } from './device-options';
@@ -177,7 +177,9 @@ export interface PlanningAction<ParamType = any> {
177
177
  }
178
178
  export interface RawResponsePlanningAIResponse {
179
179
  action: PlanningAction;
180
+ more_actions_needed_by_instruction: boolean;
180
181
  log: string;
182
+ sleep?: number;
181
183
  error?: string;
182
184
  }
183
185
  export interface PlanningAIResponse extends Omit<RawResponsePlanningAIResponse, 'action'> {
@@ -188,7 +190,6 @@ export interface PlanningAIResponse extends Omit<RawResponsePlanningAIResponse,
188
190
  yamlString?: string;
189
191
  error?: string;
190
192
  reasoning_content?: string;
191
- shouldContinuePlanning: boolean;
192
193
  }
193
194
  export interface PlanningActionParamSleep {
194
195
  timeMs: number;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "1.2.1-beta-20260108154312.0",
4
+ "version": "1.2.1-beta-20260109060244.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "main": "./dist/lib/index.js",
@@ -89,7 +89,7 @@
89
89
  "semver": "7.5.2",
90
90
  "undici": "^6.0.0",
91
91
  "zod": "3.24.3",
92
- "@midscene/shared": "1.2.1-beta-20260108154312.0"
92
+ "@midscene/shared": "1.2.1-beta-20260109060244.0"
93
93
  },
94
94
  "devDependencies": {
95
95
  "@rslib/core": "^0.18.3",
@@ -1,34 +0,0 @@
1
- import { getPreferredLanguage } from "@midscene/shared/env";
2
- function systemPromptToLocateElementPosition() {
3
- const preferredLanguage = getPreferredLanguage();
4
- return `
5
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
6
-
7
- ## Output Format
8
- \`\`\`
9
- Thought: ...
10
- Action: ...
11
- \`\`\`
12
-
13
- ## Action Space
14
- click(start_box='[x1, y1, x2, y2]')
15
- left_double(start_box='[x1, y1, x2, y2]')
16
- right_single(start_box='[x1, y1, x2, y2]')
17
- drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
18
- hotkey(key='')
19
- type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
20
- scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
21
- wait() #Sleep for 5s and take a screenshot to check for any changes.
22
- finished()
23
- call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
24
-
25
- ## Note
26
- - Use ${preferredLanguage} in \`Thought\` part.
27
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
28
-
29
- ## User Instruction
30
- `;
31
- }
32
- export { systemPromptToLocateElementPosition };
33
-
34
- //# sourceMappingURL=ui-tars-locator.mjs.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"ai-model/prompt/ui-tars-locator.mjs","sources":["../../../../src/ai-model/prompt/ui-tars-locator.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\n// claude 3.5 sonnet computer The ability to understand the content of the image is better, Does not provide element snapshot effect\nexport function systemPromptToLocateElementPosition() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\nclick(start_box='[x1, y1, x2, y2]')\nleft_double(start_box='[x1, y1, x2, y2]')\nright_single(start_box='[x1, y1, x2, y2]')\ndrag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\\\n\" at the end of \\`content\\`.\nscroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n## Note\n- Use ${preferredLanguage} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n `;\n}\n"],"names":["systemPromptToLocateElementPosition","preferredLanguage","getPreferredLanguage"],"mappings":";AAGO,SAASA;IACd,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;MAsBJ,EAAED,kBAAkB;;;;IAItB,CAAC;AACL"}
@@ -1,68 +0,0 @@
1
- "use strict";
2
- var __webpack_require__ = {};
3
- (()=>{
4
- __webpack_require__.d = (exports1, definition)=>{
5
- for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
6
- enumerable: true,
7
- get: definition[key]
8
- });
9
- };
10
- })();
11
- (()=>{
12
- __webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
13
- })();
14
- (()=>{
15
- __webpack_require__.r = (exports1)=>{
16
- if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
17
- value: 'Module'
18
- });
19
- Object.defineProperty(exports1, '__esModule', {
20
- value: true
21
- });
22
- };
23
- })();
24
- var __webpack_exports__ = {};
25
- __webpack_require__.r(__webpack_exports__);
26
- __webpack_require__.d(__webpack_exports__, {
27
- systemPromptToLocateElementPosition: ()=>systemPromptToLocateElementPosition
28
- });
29
- const env_namespaceObject = require("@midscene/shared/env");
30
- function systemPromptToLocateElementPosition() {
31
- const preferredLanguage = (0, env_namespaceObject.getPreferredLanguage)();
32
- return `
33
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
34
-
35
- ## Output Format
36
- \`\`\`
37
- Thought: ...
38
- Action: ...
39
- \`\`\`
40
-
41
- ## Action Space
42
- click(start_box='[x1, y1, x2, y2]')
43
- left_double(start_box='[x1, y1, x2, y2]')
44
- right_single(start_box='[x1, y1, x2, y2]')
45
- drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
46
- hotkey(key='')
47
- type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
48
- scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
49
- wait() #Sleep for 5s and take a screenshot to check for any changes.
50
- finished()
51
- call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
52
-
53
- ## Note
54
- - Use ${preferredLanguage} in \`Thought\` part.
55
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
56
-
57
- ## User Instruction
58
- `;
59
- }
60
- exports.systemPromptToLocateElementPosition = __webpack_exports__.systemPromptToLocateElementPosition;
61
- for(var __rspack_i in __webpack_exports__)if (-1 === [
62
- "systemPromptToLocateElementPosition"
63
- ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
64
- Object.defineProperty(exports, '__esModule', {
65
- value: true
66
- });
67
-
68
- //# sourceMappingURL=ui-tars-locator.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"ai-model/prompt/ui-tars-locator.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/ui-tars-locator.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { getPreferredLanguage } from '@midscene/shared/env';\n\n// claude 3.5 sonnet computer The ability to understand the content of the image is better, Does not provide element snapshot effect\nexport function systemPromptToLocateElementPosition() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\nclick(start_box='[x1, y1, x2, y2]')\nleft_double(start_box='[x1, y1, x2, y2]')\nright_single(start_box='[x1, y1, x2, y2]')\ndrag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\\\n\" at the end of \\`content\\`.\nscroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n## Note\n- Use ${preferredLanguage} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n `;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","systemPromptToLocateElementPosition","preferredLanguage","getPreferredLanguage"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACHO,SAASI;IACd,MAAMC,oBAAoBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA;IAE1B,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;MAsBJ,EAAED,kBAAkB;;;;IAItB,CAAC;AACL"}
@@ -1 +0,0 @@
1
- export declare function systemPromptToLocateElementPosition(): string;