@midscene/core 0.30.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. package/dist/es/agent/agent.mjs +233 -144
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/execution-session.mjs +41 -0
  4. package/dist/es/agent/execution-session.mjs.map +1 -0
  5. package/dist/es/agent/index.mjs +3 -3
  6. package/dist/es/agent/task-builder.mjs +319 -0
  7. package/dist/es/agent/task-builder.mjs.map +1 -0
  8. package/dist/es/agent/task-cache.mjs +4 -4
  9. package/dist/es/agent/task-cache.mjs.map +1 -1
  10. package/dist/es/agent/tasks.mjs +197 -504
  11. package/dist/es/agent/tasks.mjs.map +1 -1
  12. package/dist/es/agent/ui-utils.mjs +54 -35
  13. package/dist/es/agent/ui-utils.mjs.map +1 -1
  14. package/dist/es/agent/utils.mjs +16 -58
  15. package/dist/es/agent/utils.mjs.map +1 -1
  16. package/dist/es/ai-model/conversation-history.mjs +25 -13
  17. package/dist/es/ai-model/conversation-history.mjs.map +1 -1
  18. package/dist/es/ai-model/index.mjs +4 -4
  19. package/dist/es/ai-model/inspect.mjs +45 -54
  20. package/dist/es/ai-model/inspect.mjs.map +1 -1
  21. package/dist/es/ai-model/llm-planning.mjs +47 -65
  22. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  23. package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
  24. package/dist/es/ai-model/prompt/common.mjs.map +1 -1
  25. package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
  26. package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
  27. package/dist/es/ai-model/prompt/llm-locator.mjs +11 -235
  28. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  29. package/dist/es/ai-model/prompt/llm-planning.mjs +76 -322
  30. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  31. package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -14
  32. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
  33. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
  34. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
  35. package/dist/es/ai-model/prompt/playwright-generator.mjs +2 -2
  36. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  37. package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -1
  38. package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -1
  39. package/dist/es/ai-model/prompt/util.mjs +3 -88
  40. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  41. package/dist/es/ai-model/prompt/yaml-generator.mjs +10 -10
  42. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  43. package/dist/es/ai-model/service-caller/index.mjs +182 -274
  44. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  45. package/dist/es/ai-model/ui-tars-planning.mjs +69 -8
  46. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  47. package/dist/es/{ai-model/common.mjs → common.mjs} +18 -30
  48. package/dist/es/common.mjs.map +1 -0
  49. package/dist/es/device/device-options.mjs +0 -0
  50. package/dist/es/device/index.mjs +29 -12
  51. package/dist/es/device/index.mjs.map +1 -1
  52. package/dist/es/index.mjs +5 -4
  53. package/dist/es/index.mjs.map +1 -1
  54. package/dist/es/report.mjs.map +1 -1
  55. package/dist/es/{insight → service}/index.mjs +38 -51
  56. package/dist/es/service/index.mjs.map +1 -0
  57. package/dist/es/{insight → service}/utils.mjs +3 -3
  58. package/dist/es/service/utils.mjs.map +1 -0
  59. package/dist/es/task-runner.mjs +264 -0
  60. package/dist/es/task-runner.mjs.map +1 -0
  61. package/dist/es/tree.mjs +13 -2
  62. package/dist/es/tree.mjs.map +1 -0
  63. package/dist/es/types.mjs +18 -1
  64. package/dist/es/types.mjs.map +1 -1
  65. package/dist/es/utils.mjs +6 -7
  66. package/dist/es/utils.mjs.map +1 -1
  67. package/dist/es/yaml/builder.mjs.map +1 -1
  68. package/dist/es/yaml/player.mjs +121 -98
  69. package/dist/es/yaml/player.mjs.map +1 -1
  70. package/dist/es/yaml/utils.mjs +1 -1
  71. package/dist/es/yaml/utils.mjs.map +1 -1
  72. package/dist/lib/agent/agent.js +231 -142
  73. package/dist/lib/agent/agent.js.map +1 -1
  74. package/dist/lib/agent/common.js +1 -1
  75. package/dist/lib/agent/execution-session.js +75 -0
  76. package/dist/lib/agent/execution-session.js.map +1 -0
  77. package/dist/lib/agent/index.js +14 -14
  78. package/dist/lib/agent/index.js.map +1 -1
  79. package/dist/lib/agent/task-builder.js +356 -0
  80. package/dist/lib/agent/task-builder.js.map +1 -0
  81. package/dist/lib/agent/task-cache.js +8 -8
  82. package/dist/lib/agent/task-cache.js.map +1 -1
  83. package/dist/lib/agent/tasks.js +202 -506
  84. package/dist/lib/agent/tasks.js.map +1 -1
  85. package/dist/lib/agent/ui-utils.js +58 -36
  86. package/dist/lib/agent/ui-utils.js.map +1 -1
  87. package/dist/lib/agent/utils.js +26 -68
  88. package/dist/lib/agent/utils.js.map +1 -1
  89. package/dist/lib/ai-model/conversation-history.js +27 -15
  90. package/dist/lib/ai-model/conversation-history.js.map +1 -1
  91. package/dist/lib/ai-model/index.js +27 -27
  92. package/dist/lib/ai-model/index.js.map +1 -1
  93. package/dist/lib/ai-model/inspect.js +51 -57
  94. package/dist/lib/ai-model/inspect.js.map +1 -1
  95. package/dist/lib/ai-model/llm-planning.js +49 -67
  96. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  97. package/dist/lib/ai-model/prompt/assertion.js +2 -2
  98. package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
  99. package/dist/lib/ai-model/prompt/common.js +2 -2
  100. package/dist/lib/ai-model/prompt/common.js.map +1 -1
  101. package/dist/lib/ai-model/prompt/describe.js +2 -2
  102. package/dist/lib/ai-model/prompt/describe.js.map +1 -1
  103. package/dist/lib/ai-model/prompt/extraction.js +2 -2
  104. package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
  105. package/dist/lib/ai-model/prompt/llm-locator.js +14 -241
  106. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  107. package/dist/lib/ai-model/prompt/llm-planning.js +79 -328
  108. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  109. package/dist/lib/ai-model/prompt/llm-section-locator.js +17 -16
  110. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
  111. package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
  112. package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
  113. package/dist/lib/ai-model/prompt/playwright-generator.js +11 -11
  114. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  115. package/dist/lib/ai-model/prompt/ui-tars-locator.js +2 -2
  116. package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -1
  117. package/dist/lib/ai-model/prompt/ui-tars-planning.js +2 -2
  118. package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -1
  119. package/dist/lib/ai-model/prompt/util.js +7 -95
  120. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  121. package/dist/lib/ai-model/prompt/yaml-generator.js +18 -18
  122. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  123. package/dist/lib/ai-model/service-caller/index.js +288 -401
  124. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  125. package/dist/lib/ai-model/ui-tars-planning.js +71 -10
  126. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  127. package/dist/lib/{ai-model/common.js → common.js} +40 -55
  128. package/dist/lib/common.js.map +1 -0
  129. package/dist/lib/device/device-options.js +20 -0
  130. package/dist/lib/device/device-options.js.map +1 -0
  131. package/dist/lib/device/index.js +63 -40
  132. package/dist/lib/device/index.js.map +1 -1
  133. package/dist/lib/image/index.js +5 -5
  134. package/dist/lib/image/index.js.map +1 -1
  135. package/dist/lib/index.js +24 -20
  136. package/dist/lib/index.js.map +1 -1
  137. package/dist/lib/report.js +2 -2
  138. package/dist/lib/report.js.map +1 -1
  139. package/dist/lib/{insight → service}/index.js +41 -54
  140. package/dist/lib/service/index.js.map +1 -0
  141. package/dist/lib/{insight → service}/utils.js +7 -7
  142. package/dist/lib/service/utils.js.map +1 -0
  143. package/dist/lib/task-runner.js +301 -0
  144. package/dist/lib/task-runner.js.map +1 -0
  145. package/dist/lib/tree.js +13 -4
  146. package/dist/lib/tree.js.map +1 -1
  147. package/dist/lib/types.js +31 -12
  148. package/dist/lib/types.js.map +1 -1
  149. package/dist/lib/utils.js +16 -17
  150. package/dist/lib/utils.js.map +1 -1
  151. package/dist/lib/yaml/builder.js +2 -2
  152. package/dist/lib/yaml/builder.js.map +1 -1
  153. package/dist/lib/yaml/index.js +16 -22
  154. package/dist/lib/yaml/index.js.map +1 -1
  155. package/dist/lib/yaml/player.js +123 -100
  156. package/dist/lib/yaml/player.js.map +1 -1
  157. package/dist/lib/yaml/utils.js +6 -6
  158. package/dist/lib/yaml/utils.js.map +1 -1
  159. package/dist/lib/yaml.js +1 -1
  160. package/dist/lib/yaml.js.map +1 -1
  161. package/dist/types/agent/agent.d.ts +62 -17
  162. package/dist/types/agent/execution-session.d.ts +36 -0
  163. package/dist/types/agent/index.d.ts +3 -2
  164. package/dist/types/agent/task-builder.d.ts +35 -0
  165. package/dist/types/agent/tasks.d.ts +32 -23
  166. package/dist/types/agent/ui-utils.d.ts +9 -2
  167. package/dist/types/agent/utils.d.ts +9 -35
  168. package/dist/types/ai-model/conversation-history.d.ts +8 -4
  169. package/dist/types/ai-model/index.d.ts +5 -5
  170. package/dist/types/ai-model/inspect.d.ts +20 -12
  171. package/dist/types/ai-model/llm-planning.d.ts +3 -1
  172. package/dist/types/ai-model/prompt/llm-locator.d.ts +1 -6
  173. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
  174. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +1 -3
  175. package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
  176. package/dist/types/ai-model/prompt/util.d.ts +2 -34
  177. package/dist/types/ai-model/service-caller/index.d.ts +2 -3
  178. package/dist/types/ai-model/ui-tars-planning.d.ts +15 -2
  179. package/dist/types/{ai-model/common.d.ts → common.d.ts} +6 -6
  180. package/dist/types/device/device-options.d.ts +57 -0
  181. package/dist/types/device/index.d.ts +55 -39
  182. package/dist/types/index.d.ts +7 -6
  183. package/dist/types/service/index.d.ts +26 -0
  184. package/dist/types/service/utils.d.ts +2 -0
  185. package/dist/types/task-runner.d.ts +49 -0
  186. package/dist/types/tree.d.ts +4 -1
  187. package/dist/types/types.d.ts +103 -66
  188. package/dist/types/yaml/utils.d.ts +1 -1
  189. package/dist/types/yaml.d.ts +68 -43
  190. package/package.json +9 -12
  191. package/dist/es/ai-model/action-executor.mjs +0 -129
  192. package/dist/es/ai-model/action-executor.mjs.map +0 -1
  193. package/dist/es/ai-model/common.mjs.map +0 -1
  194. package/dist/es/insight/index.mjs.map +0 -1
  195. package/dist/es/insight/utils.mjs.map +0 -1
  196. package/dist/lib/ai-model/action-executor.js +0 -163
  197. package/dist/lib/ai-model/action-executor.js.map +0 -1
  198. package/dist/lib/ai-model/common.js.map +0 -1
  199. package/dist/lib/insight/index.js.map +0 -1
  200. package/dist/lib/insight/utils.js.map +0 -1
  201. package/dist/types/ai-model/action-executor.d.ts +0 -19
  202. package/dist/types/insight/index.d.ts +0 -31
  203. package/dist/types/insight/utils.d.ts +0 -2
@@ -1,8 +1,8 @@
1
1
  import type { NodeType } from '@midscene/shared/constants';
2
- import type { TModelConfigFn } from '@midscene/shared/env';
3
- import type { BaseElement, ElementTreeNode, Rect, Size } from '@midscene/shared/types';
2
+ import type { CreateOpenAIClientFn, TModelConfig } from '@midscene/shared/env';
3
+ import type { BaseElement, LocateResultElement, Rect, Size } from '@midscene/shared/types';
4
4
  import type { z } from 'zod';
5
- import type { TUserPrompt } from './ai-model/common';
5
+ import type { TUserPrompt } from './common';
6
6
  import type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';
7
7
  export type { ElementTreeNode, BaseElement, Rect, Size, Point, } from '@midscene/shared/types';
8
8
  export * from './yaml';
@@ -10,11 +10,13 @@ export type AIUsageInfo = Record<string, any> & {
10
10
  prompt_tokens: number | undefined;
11
11
  completion_tokens: number | undefined;
12
12
  total_tokens: number | undefined;
13
+ cached_input: number | undefined;
13
14
  time_cost: number | undefined;
14
15
  model_name: string | undefined;
15
16
  model_description: string | undefined;
16
17
  intent: string | undefined;
17
18
  };
19
+ export type { LocateResultElement };
18
20
  /**
19
21
  * openai
20
22
  *
@@ -39,23 +41,11 @@ export type AISingleElementResponseByPosition = {
39
41
  text: string;
40
42
  };
41
43
  export type AISingleElementResponse = AISingleElementResponseById;
42
- export interface AIElementLocatorResponse {
43
- elements: {
44
- id: string;
45
- reason?: string;
46
- text?: string;
47
- xpaths?: string[];
48
- }[];
49
- bbox?: [number, number, number, number];
50
- isOrderSensitive?: boolean;
51
- errors?: string[];
52
- }
53
44
  export interface AIElementCoordinatesResponse {
54
45
  bbox: [number, number, number, number];
55
- isOrderSensitive?: boolean;
56
46
  errors?: string[];
57
47
  }
58
- export type AIElementResponse = AIElementLocatorResponse | AIElementCoordinatesResponse;
48
+ export type AIElementResponse = AIElementCoordinatesResponse;
59
49
  export interface AIDataExtractionResponse<DataDemand> {
60
50
  data: DataDemand;
61
51
  errors?: string[];
@@ -91,35 +81,23 @@ export interface AgentDescribeElementAtPointResult {
91
81
  /**
92
82
  * context
93
83
  */
94
- export declare abstract class UIContext<ElementType extends BaseElement = BaseElement> {
84
+ export declare abstract class UIContext {
95
85
  abstract screenshotBase64: string;
96
- abstract tree: ElementTreeNode<ElementType>;
97
86
  abstract size: Size;
98
87
  abstract _isFrozen?: boolean;
99
88
  }
100
89
  export type EnsureObject<T> = {
101
90
  [K in keyof T]: any;
102
91
  };
103
- export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
104
- export type InsightExtractParam = string | Record<string, string>;
92
+ export type ServiceAction = 'locate' | 'extract' | 'assert' | 'describe';
93
+ export type ServiceExtractParam = string | Record<string, string>;
105
94
  export type ElementCacheFeature = Record<string, unknown>;
106
- export type LocateResultElement = {
107
- center: [number, number];
108
- rect: Rect;
109
- id: string;
110
- indexId?: number;
111
- xpaths: string[];
112
- attributes: {
113
- nodeType: NodeType;
114
- [key: string]: string;
115
- };
116
- isOrderSensitive?: boolean;
117
- };
118
95
  export interface LocateResult {
119
96
  element: LocateResultElement | null;
120
97
  rect?: Rect;
121
98
  }
122
- export interface InsightTaskInfo {
99
+ export type ThinkingLevel = 'off' | 'medium' | 'high';
100
+ export interface ServiceTaskInfo {
123
101
  durationMs: number;
124
102
  formatResponse?: string;
125
103
  rawResponse?: string;
@@ -135,26 +113,38 @@ export interface ReportDumpWithAttributes {
135
113
  dumpString: string;
136
114
  attributes?: Record<string, any>;
137
115
  }
138
- export interface InsightDump extends DumpMeta {
116
+ export interface ServiceDump extends DumpMeta {
139
117
  type: 'locate' | 'extract' | 'assert';
140
118
  logId: string;
141
119
  userQuery: {
142
120
  element?: TUserPrompt;
143
- dataDemand?: InsightExtractParam;
121
+ dataDemand?: ServiceExtractParam;
144
122
  assertion?: TUserPrompt;
145
123
  };
146
- matchedElement: BaseElement[];
124
+ matchedElement: LocateResultElement[];
147
125
  matchedRect?: Rect;
148
126
  deepThink?: boolean;
149
127
  data: any;
150
128
  assertionPass?: boolean;
151
129
  assertionThought?: string;
152
- taskInfo: InsightTaskInfo;
130
+ taskInfo: ServiceTaskInfo;
153
131
  error?: string;
154
132
  output?: any;
155
133
  }
156
- export type PartialInsightDumpFromSDK = Omit<InsightDump, 'logTime' | 'logId' | 'model_name'>;
157
- export type DumpSubscriber = (dump: InsightDump) => Promise<void> | void;
134
+ export type PartialServiceDumpFromSDK = Omit<ServiceDump, 'logTime' | 'logId' | 'model_name'>;
135
+ export interface ServiceResultBase {
136
+ dump: ServiceDump;
137
+ }
138
+ export type LocateResultWithDump = LocateResult & ServiceResultBase;
139
+ export interface ServiceExtractResult<T> extends ServiceResultBase {
140
+ data: T;
141
+ thought?: string;
142
+ usage?: AIUsageInfo;
143
+ }
144
+ export declare class ServiceError extends Error {
145
+ dump: ServiceDump;
146
+ constructor(message: string, dump: ServiceDump);
147
+ }
158
148
  export interface LiteUISection {
159
149
  name: string;
160
150
  description: string;
@@ -162,7 +152,7 @@ export interface LiteUISection {
162
152
  textIds: string[];
163
153
  }
164
154
  export type ElementById = (id: string) => BaseElement | null;
165
- export type InsightAssertionResponse = AIAssertionResponse & {
155
+ export type ServiceAssertionResponse = AIAssertionResponse & {
166
156
  usage?: AIUsageInfo;
167
157
  };
168
158
  /**
@@ -172,6 +162,7 @@ export type OnTaskStartTip = (tip: string) => Promise<void> | void;
172
162
  export interface AgentWaitForOpt {
173
163
  checkIntervalMs?: number;
174
164
  timeoutMs?: number;
165
+ [key: string]: unknown;
175
166
  }
176
167
  export interface AgentAssertOpt {
177
168
  keepRawResponse?: boolean;
@@ -181,33 +172,27 @@ export interface AgentAssertOpt {
181
172
  *
182
173
  */
183
174
  export interface PlanningLocateParam extends DetailedLocateParam {
184
- id?: string;
185
175
  bbox?: [number, number, number, number];
186
176
  }
187
177
  export interface PlanningAction<ParamType = any> {
188
178
  thought?: string;
189
179
  type: string;
190
180
  param: ParamType;
191
- locate?: PlanningLocateParam | null;
192
181
  }
193
- export interface PlanningAIResponse {
194
- action?: PlanningAction;
195
- actions?: PlanningAction[];
182
+ export interface RawResponsePlanningAIResponse {
183
+ action: PlanningAction;
196
184
  more_actions_needed_by_instruction: boolean;
197
185
  log: string;
198
186
  sleep?: number;
199
187
  error?: string;
188
+ }
189
+ export interface PlanningAIResponse extends Omit<RawResponsePlanningAIResponse, 'action'> {
190
+ actions?: PlanningAction[];
200
191
  usage?: AIUsageInfo;
201
192
  rawResponse?: string;
202
193
  yamlFlow?: MidsceneYamlFlowItem[];
203
194
  yamlString?: string;
204
- }
205
- export type PlanningActionParamTap = null;
206
- export type PlanningActionParamHover = null;
207
- export type PlanningActionParamRightClick = null;
208
- export interface PlanningActionParamInputOrKeyPress {
209
- value: string;
210
- autoDismissKeyboard?: boolean;
195
+ error?: string;
211
196
  }
212
197
  export interface PlanningActionParamSleep {
213
198
  timeMs: number;
@@ -216,10 +201,10 @@ export interface PlanningActionParamError {
216
201
  thought: string;
217
202
  }
218
203
  export type PlanningActionParamWaitFor = AgentWaitForOpt & {};
219
- export interface AndroidLongPressParam {
204
+ export interface LongPressParam {
220
205
  duration?: number;
221
206
  }
222
- export interface AndroidPullParam {
207
+ export interface PullParam {
223
208
  direction: 'up' | 'down';
224
209
  distance?: number;
225
210
  duration?: number;
@@ -247,17 +232,18 @@ export interface ExecutionRecorderItem {
247
232
  screenshot?: string;
248
233
  timing?: string;
249
234
  }
250
- export type ExecutionTaskType = 'Planning' | 'Insight' | 'Action' | 'Assertion' | 'Log';
235
+ export type ExecutionTaskType = 'Planning' | 'Insight' | 'Action Space' | 'Log';
251
236
  export interface ExecutorContext {
252
237
  task: ExecutionTask;
253
238
  element?: LocateResultElement | null;
239
+ uiContext?: UIContext;
254
240
  }
255
241
  export interface ExecutionTaskApply<Type extends ExecutionTaskType = any, TaskParam = any, TaskOutput = any, TaskLog = any> {
256
242
  type: Type;
257
243
  subType?: string;
244
+ subTask?: boolean;
258
245
  param?: TaskParam;
259
246
  thought?: string;
260
- locate?: PlanningLocateParam | null;
261
247
  uiContext?: UIContext;
262
248
  executor: (param: TaskParam, context: ExecutorContext) => Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void> | undefined | void;
263
249
  }
@@ -288,17 +274,17 @@ export interface ExecutionDump extends DumpMeta {
288
274
  name: string;
289
275
  description?: string;
290
276
  tasks: ExecutionTask[];
291
- aiActionContext?: string;
277
+ aiActContext?: string;
292
278
  }
293
279
  export type ExecutionTaskInsightLocateParam = PlanningLocateParam;
294
280
  export interface ExecutionTaskInsightLocateOutput {
295
281
  element: LocateResultElement | null;
296
282
  }
297
- export type ExecutionTaskInsightDump = InsightDump;
283
+ export type ExecutionTaskInsightDump = ServiceDump;
298
284
  export type ExecutionTaskInsightLocateApply = ExecutionTaskApply<'Insight', ExecutionTaskInsightLocateParam, ExecutionTaskInsightLocateOutput, ExecutionTaskInsightDump>;
299
285
  export type ExecutionTaskInsightLocate = ExecutionTask<ExecutionTaskInsightLocateApply>;
300
286
  export interface ExecutionTaskInsightQueryParam {
301
- dataDemand: InsightExtractParam;
287
+ dataDemand: ServiceExtractParam;
302
288
  }
303
289
  export interface ExecutionTaskInsightQueryOutput {
304
290
  data: any;
@@ -308,9 +294,9 @@ export type ExecutionTaskInsightQuery = ExecutionTask<ExecutionTaskInsightQueryA
308
294
  export interface ExecutionTaskInsightAssertionParam {
309
295
  assertion: string;
310
296
  }
311
- export type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<'Insight', ExecutionTaskInsightAssertionParam, InsightAssertionResponse, ExecutionTaskInsightDump>;
297
+ export type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<'Insight', ExecutionTaskInsightAssertionParam, ServiceAssertionResponse, ExecutionTaskInsightDump>;
312
298
  export type ExecutionTaskInsightAssertion = ExecutionTask<ExecutionTaskInsightAssertionApply>;
313
- export type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<'Action', ActionParam, void, void>;
299
+ export type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<'Action Space', ActionParam, void, void>;
314
300
  export type ExecutionTaskAction = ExecutionTask<ExecutionTaskActionApply>;
315
301
  export type ExecutionTaskLogApply<LogParam = {
316
302
  content: string;
@@ -318,8 +304,16 @@ export type ExecutionTaskLogApply<LogParam = {
318
304
  export type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;
319
305
  export type ExecutionTaskPlanningApply = ExecutionTaskApply<'Planning', {
320
306
  userInstruction: string;
307
+ aiActContext?: string;
321
308
  }, PlanningAIResponse>;
322
309
  export type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;
310
+ export type ExecutionTaskPlanningLocateParam = PlanningLocateParam;
311
+ export interface ExecutionTaskPlanningLocateOutput {
312
+ element: LocateResultElement | null;
313
+ }
314
+ export type ExecutionTaskPlanningDump = ServiceDump;
315
+ export type ExecutionTaskPlanningLocateApply = ExecutionTaskApply<'Planning', ExecutionTaskPlanningLocateParam, ExecutionTaskPlanningLocateOutput, ExecutionTaskPlanningDump>;
316
+ export type ExecutionTaskPlanningLocate = ExecutionTask<ExecutionTaskPlanningLocateApply>;
323
317
  export interface GroupedActionDump {
324
318
  sdkVersion: string;
325
319
  groupName: string;
@@ -359,13 +353,25 @@ export interface StreamingAIResponse {
359
353
  /** Whether the response was streamed */
360
354
  isStreamed: boolean;
361
355
  }
362
- export interface DeviceAction<T = any> {
356
+ export interface DeviceAction<TParam = any, TReturn = any> {
363
357
  name: string;
364
358
  description?: string;
365
359
  interfaceAlias?: string;
366
- paramSchema?: z.ZodType<T>;
367
- call: (param: T, context: ExecutorContext) => Promise<void> | void;
360
+ paramSchema?: z.ZodType<TParam>;
361
+ call: (param: TParam, context: ExecutorContext) => Promise<TReturn> | TReturn;
362
+ delayAfterRunner?: number;
368
363
  }
364
+ /**
365
+ * Type utilities for extracting types from DeviceAction definitions
366
+ */
367
+ /**
368
+ * Extract parameter type from a DeviceAction
369
+ */
370
+ export type ActionParam<Action extends DeviceAction<any, any>> = Action extends DeviceAction<infer P, any> ? P : never;
371
+ /**
372
+ * Extract return type from a DeviceAction
373
+ */
374
+ export type ActionReturn<Action extends DeviceAction<any, any>> = Action extends DeviceAction<any, infer R> ? R : never;
369
375
  /**
370
376
  * Web-specific types
371
377
  */
@@ -376,7 +382,7 @@ export interface WebElementInfo extends BaseElement {
376
382
  [key: string]: string;
377
383
  };
378
384
  }
379
- export type WebUIContext = UIContext<WebElementInfo>;
385
+ export type WebUIContext = UIContext;
380
386
  /**
381
387
  * Agent
382
388
  */
@@ -393,11 +399,42 @@ export interface AgentOpt {
393
399
  generateReport?: boolean;
394
400
  autoPrintReportMsg?: boolean;
395
401
  onTaskStartTip?: OnTaskStartTip;
402
+ aiActContext?: string;
396
403
  aiActionContext?: string;
397
404
  reportFileName?: string;
398
- modelConfig?: TModelConfigFn;
405
+ modelConfig?: TModelConfig;
399
406
  cache?: Cache;
407
+ /**
408
+ * Maximum number of replanning cycles for aiAct.
409
+ * Defaults to 20 (40 for `vlm-ui-tars`) when not provided.
410
+ * If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.
411
+ */
400
412
  replanningCycleLimit?: number;
413
+ /**
414
+ * Custom OpenAI client factory function
415
+ *
416
+ * If provided, this function will be called to create OpenAI client instances
417
+ * for each AI call, allowing you to:
418
+ * - Wrap clients with observability tools (langsmith, langfuse)
419
+ * - Use custom OpenAI-compatible clients
420
+ * - Apply different configurations based on intent
421
+ *
422
+ * @param config - Resolved model configuration
423
+ * @returns OpenAI client instance (original or wrapped)
424
+ *
425
+ * @example
426
+ * ```typescript
427
+ * createOpenAIClient: async (openai, opts) => {
428
+ * // Wrap with langsmith for planning tasks
429
+ * if (opts.baseURL?.includes('planning')) {
430
+ * return wrapOpenAI(openai, { metadata: { task: 'planning' } });
431
+ * }
432
+ *
433
+ * return openai;
434
+ * }
435
+ * ```
436
+ */
437
+ createOpenAIClient?: CreateOpenAIClientFn;
401
438
  }
402
439
  export type TestStatus = 'passed' | 'failed' | 'timedOut' | 'skipped' | 'interrupted';
403
440
  export interface ReportFileWithAttributes {
@@ -1,4 +1,4 @@
1
- import type { TUserPrompt } from '../ai-model/common';
1
+ import type { TUserPrompt } from '../common';
2
2
  import type { DetailedLocateParam, LocateOption, MidsceneYamlScript } from '../types';
3
3
  export declare function interpolateEnvVars(content: string): string;
4
4
  export declare function parseYamlScript(content: string, filePath?: string): MidsceneYamlScript;
@@ -1,17 +1,18 @@
1
- import type { TUserPrompt } from './ai-model/common';
2
- import type { AgentOpt, Rect } from './types';
3
- import type { BaseElement, UIContext } from './types';
1
+ import type { TUserPrompt } from './common';
2
+ import type { AndroidDeviceOpt, IOSDeviceOpt } from './device';
3
+ import type { AgentOpt, LocateResultElement, Rect } from './types';
4
+ import type { UIContext } from './types';
4
5
  export interface LocateOption {
5
6
  prompt?: TUserPrompt;
6
7
  deepThink?: boolean;
7
8
  cacheable?: boolean;
8
9
  xpath?: string;
9
- uiContext?: UIContext<BaseElement>;
10
+ uiContext?: UIContext;
10
11
  }
11
- export interface InsightExtractOption {
12
+ export interface ServiceExtractOption {
12
13
  domIncluded?: boolean | 'visible-only';
13
14
  screenshotIncluded?: boolean;
14
- doNotThrowError?: boolean;
15
+ [key: string]: unknown;
15
16
  }
16
17
  export interface ReferenceImage {
17
18
  base64: string;
@@ -21,11 +22,13 @@ export interface DetailedLocateParam extends LocateOption {
21
22
  prompt: TUserPrompt;
22
23
  referenceImage?: ReferenceImage;
23
24
  }
24
- export interface ScrollParam {
25
- direction: 'down' | 'up' | 'right' | 'left';
26
- scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
27
- distance?: null | number;
28
- }
25
+ export type ActionScrollParam = {
26
+ direction?: 'down' | 'up' | 'right' | 'left';
27
+ scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft';
28
+ distance?: number | null;
29
+ locate?: LocateResultElement;
30
+ };
31
+ export type ScrollParam = Omit<ActionScrollParam, 'locate'>;
29
32
  export interface MidsceneYamlScript {
30
33
  target?: MidsceneYamlScriptWebEnv;
31
34
  web?: MidsceneYamlScriptWebEnv;
@@ -41,7 +44,30 @@ export interface MidsceneYamlTask {
41
44
  flow: MidsceneYamlFlowItem[];
42
45
  continueOnError?: boolean;
43
46
  }
44
- export type MidsceneYamlScriptAgentOpt = Pick<AgentOpt, 'aiActionContext' | 'cache'>;
47
+ /**
48
+ * Agent configuration options that can be specified in YAML scripts.
49
+ *
50
+ * This type includes serializable fields from AgentOpt, excluding non-serializable
51
+ * fields like functions and complex objects. All fields are optional.
52
+ *
53
+ * @remarks
54
+ * - testId priority: CLI parameter > YAML agent.testId > filename
55
+ * - These settings apply to all platforms (Web, Android, iOS, Generic Interface)
56
+ * - modelConfig is configured through environment variables, not in YAML
57
+ *
58
+ * @example
59
+ * ```yaml
60
+ * agent:
61
+ * testId: "checkout-test"
62
+ * groupName: "E2E Test Suite"
63
+ * generateReport: true
64
+ * replanningCycleLimit: 30
65
+ * cache:
66
+ * id: "checkout-cache"
67
+ * strategy: "read-write"
68
+ * ```
69
+ */
70
+ export type MidsceneYamlScriptAgentOpt = Pick<AgentOpt, 'testId' | 'groupName' | 'groupDescription' | 'generateReport' | 'autoPrintReportMsg' | 'reportFileName' | 'replanningCycleLimit' | 'aiActContext' | 'aiActionContext' | 'cache'>;
45
71
  export interface MidsceneYamlScriptConfig {
46
72
  output?: string;
47
73
  unstableLogContent?: boolean | string;
@@ -65,58 +91,56 @@ export interface MidsceneYamlScriptWebEnv extends MidsceneYamlScriptConfig, Mids
65
91
  };
66
92
  cookie?: string;
67
93
  forceSameTabNavigation?: boolean;
94
+ /**
95
+ * Custom Chrome launch arguments (Puppeteer only, not supported in bridge mode).
96
+ *
97
+ * Allows passing custom command-line arguments to Chrome/Chromium when launching the browser.
98
+ * This is useful for testing scenarios that require specific browser configurations.
99
+ *
100
+ * ⚠️ Security Warning: Some arguments (e.g., --no-sandbox, --disable-web-security) may
101
+ * reduce browser security. Use only in controlled testing environments.
102
+ *
103
+ * @example
104
+ * ```yaml
105
+ * web:
106
+ * url: https://example.com
107
+ * chromeArgs:
108
+ * - '--disable-features=ThirdPartyCookiePhaseout'
109
+ * - '--disable-features=SameSiteByDefaultCookies'
110
+ * - '--window-size=1920,1080'
111
+ * ```
112
+ */
113
+ chromeArgs?: string[];
68
114
  bridgeMode?: false | 'newTabWithUrl' | 'currentTab';
69
115
  closeNewTabsAfterDisconnect?: boolean;
70
116
  }
71
- export interface MidsceneYamlScriptAndroidEnv extends MidsceneYamlScriptConfig {
117
+ export interface MidsceneYamlScriptAndroidEnv extends MidsceneYamlScriptConfig, Omit<AndroidDeviceOpt, 'customActions'> {
72
118
  deviceId?: string;
73
119
  launch?: string;
74
120
  }
75
- export interface MidsceneYamlScriptIOSEnv extends MidsceneYamlScriptConfig {
76
- wdaPort?: number;
77
- wdaHost?: string;
78
- autoDismissKeyboard?: boolean;
121
+ export interface MidsceneYamlScriptIOSEnv extends MidsceneYamlScriptConfig, Omit<IOSDeviceOpt, 'customActions'> {
79
122
  launch?: string;
80
123
  }
81
124
  export type MidsceneYamlScriptEnv = MidsceneYamlScriptWebEnv | MidsceneYamlScriptAndroidEnv | MidsceneYamlScriptIOSEnv;
82
125
  export interface MidsceneYamlFlowItemAIAction {
83
- ai?: string;
84
126
  aiAction?: string;
127
+ ai?: string;
128
+ aiAct?: string;
85
129
  aiActionProgressTips?: string[];
86
130
  cacheable?: boolean;
131
+ _deepThink?: boolean;
132
+ [key: string]: unknown;
87
133
  }
88
134
  export interface MidsceneYamlFlowItemAIAssert {
89
135
  aiAssert: string;
90
136
  errorMessage?: string;
91
137
  name?: string;
92
- }
93
- export interface MidsceneYamlFlowItemAIQuery extends InsightExtractOption {
94
- aiQuery: string;
95
- name?: string;
96
- }
97
- export interface MidsceneYamlFlowItemAINumber extends InsightExtractOption {
98
- aiNumber: string;
99
- name?: string;
100
- }
101
- export interface MidsceneYamlFlowItemAIString extends InsightExtractOption {
102
- aiString: string;
103
- name?: string;
104
- }
105
- export interface MidsceneYamlFlowItemAIAsk extends InsightExtractOption {
106
- aiAsk: string;
107
- name?: string;
108
- }
109
- export interface MidsceneYamlFlowItemAIBoolean extends InsightExtractOption {
110
- aiBoolean: string;
111
- name?: string;
112
- }
113
- export interface MidsceneYamlFlowItemAILocate extends LocateOption {
114
- aiLocate: string;
115
- name?: string;
138
+ [key: string]: unknown;
116
139
  }
117
140
  export interface MidsceneYamlFlowItemAIWaitFor {
118
141
  aiWaitFor: string;
119
142
  timeout?: number;
143
+ [key: string]: unknown;
120
144
  }
121
145
  export interface MidsceneYamlFlowItemEvaluateJavaScript {
122
146
  javascript: string;
@@ -127,9 +151,10 @@ export interface MidsceneYamlFlowItemSleep {
127
151
  }
128
152
  export interface MidsceneYamlFlowItemLogScreenshot {
129
153
  logScreenshot?: string;
154
+ recordToReport?: string;
130
155
  content?: string;
131
156
  }
132
- export type MidsceneYamlFlowItem = MidsceneYamlFlowItemAIAction | MidsceneYamlFlowItemAIAssert | MidsceneYamlFlowItemAIQuery | MidsceneYamlFlowItemAIWaitFor | MidsceneYamlFlowItemSleep | MidsceneYamlFlowItemLogScreenshot;
157
+ export type MidsceneYamlFlowItem = MidsceneYamlFlowItemAIAction | MidsceneYamlFlowItemAIAssert | MidsceneYamlFlowItemAIWaitFor | MidsceneYamlFlowItemEvaluateJavaScript | MidsceneYamlFlowItemSleep | MidsceneYamlFlowItemLogScreenshot;
133
158
  export interface FreeFn {
134
159
  name: string;
135
160
  fn: () => void;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "0.30.10",
4
+ "version": "1.0.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "main": "./dist/lib/index.js",
@@ -79,23 +79,20 @@
79
79
  }
80
80
  },
81
81
  "dependencies": {
82
- "@anthropic-ai/sdk": "0.33.1",
83
- "@azure/identity": "4.5.0",
84
82
  "@ui-tars/action-parser": "1.2.3",
83
+ "dayjs": "^1.11.11",
85
84
  "dotenv": "^16.4.5",
86
- "https-proxy-agent": "7.0.2",
85
+ "fetch-socks": "^1.3.0",
86
+ "openai": "6.3.0",
87
+ "undici": "^6.0.0",
87
88
  "jsonrepair": "3.12.0",
88
- "langsmith": "0.3.7",
89
- "openai": "4.81.0",
90
- "socks-proxy-agent": "8.0.4",
91
- "zod": "3.24.3",
92
89
  "semver": "7.5.2",
93
90
  "js-yaml": "4.1.0",
94
- "@midscene/recorder": "0.30.10",
95
- "@midscene/shared": "0.30.10"
91
+ "zod": "3.24.3",
92
+ "@midscene/shared": "1.0.0"
96
93
  },
97
94
  "devDependencies": {
98
- "@rslib/core": "^0.11.2",
95
+ "@rslib/core": "^0.18.3",
99
96
  "@types/node": "^18.0.0",
100
97
  "@types/node-fetch": "2.6.11",
101
98
  "@types/js-yaml": "4.0.9",
@@ -114,7 +111,7 @@
114
111
  "scripts": {
115
112
  "dev": "npm run build:watch",
116
113
  "build": "rslib build",
117
- "build:watch": "USE_DEV_REPORT=1 rslib build --watch",
114
+ "build:watch": "USE_DEV_REPORT=1 rslib build --watch --no-clean",
118
115
  "test": "vitest --run",
119
116
  "test:u": "vitest --run -u",
120
117
  "test:ai": "AITEST=true npm run test",