@midscene/core 0.30.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/dist/es/agent/agent.mjs +233 -144
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/execution-session.mjs +41 -0
  4. package/dist/es/agent/execution-session.mjs.map +1 -0
  5. package/dist/es/agent/index.mjs +3 -3
  6. package/dist/es/agent/task-builder.mjs +319 -0
  7. package/dist/es/agent/task-builder.mjs.map +1 -0
  8. package/dist/es/agent/task-cache.mjs +4 -4
  9. package/dist/es/agent/task-cache.mjs.map +1 -1
  10. package/dist/es/agent/tasks.mjs +197 -504
  11. package/dist/es/agent/tasks.mjs.map +1 -1
  12. package/dist/es/agent/ui-utils.mjs +54 -35
  13. package/dist/es/agent/ui-utils.mjs.map +1 -1
  14. package/dist/es/agent/utils.mjs +16 -58
  15. package/dist/es/agent/utils.mjs.map +1 -1
  16. package/dist/es/ai-model/conversation-history.mjs +25 -13
  17. package/dist/es/ai-model/conversation-history.mjs.map +1 -1
  18. package/dist/es/ai-model/index.mjs +4 -4
  19. package/dist/es/ai-model/inspect.mjs +45 -54
  20. package/dist/es/ai-model/inspect.mjs.map +1 -1
  21. package/dist/es/ai-model/llm-planning.mjs +47 -65
  22. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  23. package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
  24. package/dist/es/ai-model/prompt/common.mjs.map +1 -1
  25. package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
  26. package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
  27. package/dist/es/ai-model/prompt/llm-locator.mjs +11 -235
  28. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  29. package/dist/es/ai-model/prompt/llm-planning.mjs +76 -322
  30. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  31. package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -14
  32. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
  33. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
  34. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
  35. package/dist/es/ai-model/prompt/playwright-generator.mjs +2 -2
  36. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  37. package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -1
  38. package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -1
  39. package/dist/es/ai-model/prompt/util.mjs +3 -88
  40. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  41. package/dist/es/ai-model/prompt/yaml-generator.mjs +10 -10
  42. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  43. package/dist/es/ai-model/service-caller/index.mjs +182 -274
  44. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  45. package/dist/es/ai-model/ui-tars-planning.mjs +69 -8
  46. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  47. package/dist/es/{ai-model/common.mjs → common.mjs} +18 -30
  48. package/dist/es/common.mjs.map +1 -0
  49. package/dist/es/device/device-options.mjs +0 -0
  50. package/dist/es/device/index.mjs +29 -12
  51. package/dist/es/device/index.mjs.map +1 -1
  52. package/dist/es/index.mjs +5 -4
  53. package/dist/es/index.mjs.map +1 -1
  54. package/dist/es/report.mjs.map +1 -1
  55. package/dist/es/{insight → service}/index.mjs +38 -51
  56. package/dist/es/service/index.mjs.map +1 -0
  57. package/dist/es/{insight → service}/utils.mjs +3 -3
  58. package/dist/es/service/utils.mjs.map +1 -0
  59. package/dist/es/task-runner.mjs +264 -0
  60. package/dist/es/task-runner.mjs.map +1 -0
  61. package/dist/es/tree.mjs +13 -2
  62. package/dist/es/tree.mjs.map +1 -0
  63. package/dist/es/types.mjs +18 -1
  64. package/dist/es/types.mjs.map +1 -1
  65. package/dist/es/utils.mjs +6 -7
  66. package/dist/es/utils.mjs.map +1 -1
  67. package/dist/es/yaml/builder.mjs.map +1 -1
  68. package/dist/es/yaml/player.mjs +121 -98
  69. package/dist/es/yaml/player.mjs.map +1 -1
  70. package/dist/es/yaml/utils.mjs +1 -1
  71. package/dist/es/yaml/utils.mjs.map +1 -1
  72. package/dist/lib/agent/agent.js +231 -142
  73. package/dist/lib/agent/agent.js.map +1 -1
  74. package/dist/lib/agent/common.js +1 -1
  75. package/dist/lib/agent/execution-session.js +75 -0
  76. package/dist/lib/agent/execution-session.js.map +1 -0
  77. package/dist/lib/agent/index.js +14 -14
  78. package/dist/lib/agent/index.js.map +1 -1
  79. package/dist/lib/agent/task-builder.js +356 -0
  80. package/dist/lib/agent/task-builder.js.map +1 -0
  81. package/dist/lib/agent/task-cache.js +8 -8
  82. package/dist/lib/agent/task-cache.js.map +1 -1
  83. package/dist/lib/agent/tasks.js +202 -506
  84. package/dist/lib/agent/tasks.js.map +1 -1
  85. package/dist/lib/agent/ui-utils.js +58 -36
  86. package/dist/lib/agent/ui-utils.js.map +1 -1
  87. package/dist/lib/agent/utils.js +26 -68
  88. package/dist/lib/agent/utils.js.map +1 -1
  89. package/dist/lib/ai-model/conversation-history.js +27 -15
  90. package/dist/lib/ai-model/conversation-history.js.map +1 -1
  91. package/dist/lib/ai-model/index.js +27 -27
  92. package/dist/lib/ai-model/index.js.map +1 -1
  93. package/dist/lib/ai-model/inspect.js +51 -57
  94. package/dist/lib/ai-model/inspect.js.map +1 -1
  95. package/dist/lib/ai-model/llm-planning.js +49 -67
  96. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  97. package/dist/lib/ai-model/prompt/assertion.js +2 -2
  98. package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
  99. package/dist/lib/ai-model/prompt/common.js +2 -2
  100. package/dist/lib/ai-model/prompt/common.js.map +1 -1
  101. package/dist/lib/ai-model/prompt/describe.js +2 -2
  102. package/dist/lib/ai-model/prompt/describe.js.map +1 -1
  103. package/dist/lib/ai-model/prompt/extraction.js +2 -2
  104. package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
  105. package/dist/lib/ai-model/prompt/llm-locator.js +14 -241
  106. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  107. package/dist/lib/ai-model/prompt/llm-planning.js +79 -328
  108. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  109. package/dist/lib/ai-model/prompt/llm-section-locator.js +17 -16
  110. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
  111. package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
  112. package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
  113. package/dist/lib/ai-model/prompt/playwright-generator.js +11 -11
  114. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  115. package/dist/lib/ai-model/prompt/ui-tars-locator.js +2 -2
  116. package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -1
  117. package/dist/lib/ai-model/prompt/ui-tars-planning.js +2 -2
  118. package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -1
  119. package/dist/lib/ai-model/prompt/util.js +7 -95
  120. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  121. package/dist/lib/ai-model/prompt/yaml-generator.js +18 -18
  122. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  123. package/dist/lib/ai-model/service-caller/index.js +288 -401
  124. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  125. package/dist/lib/ai-model/ui-tars-planning.js +71 -10
  126. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  127. package/dist/lib/{ai-model/common.js → common.js} +40 -55
  128. package/dist/lib/common.js.map +1 -0
  129. package/dist/lib/device/device-options.js +20 -0
  130. package/dist/lib/device/device-options.js.map +1 -0
  131. package/dist/lib/device/index.js +63 -40
  132. package/dist/lib/device/index.js.map +1 -1
  133. package/dist/lib/image/index.js +5 -5
  134. package/dist/lib/image/index.js.map +1 -1
  135. package/dist/lib/index.js +24 -20
  136. package/dist/lib/index.js.map +1 -1
  137. package/dist/lib/report.js +2 -2
  138. package/dist/lib/report.js.map +1 -1
  139. package/dist/lib/{insight → service}/index.js +41 -54
  140. package/dist/lib/service/index.js.map +1 -0
  141. package/dist/lib/{insight → service}/utils.js +7 -7
  142. package/dist/lib/service/utils.js.map +1 -0
  143. package/dist/lib/task-runner.js +301 -0
  144. package/dist/lib/task-runner.js.map +1 -0
  145. package/dist/lib/tree.js +13 -4
  146. package/dist/lib/tree.js.map +1 -1
  147. package/dist/lib/types.js +31 -12
  148. package/dist/lib/types.js.map +1 -1
  149. package/dist/lib/utils.js +16 -17
  150. package/dist/lib/utils.js.map +1 -1
  151. package/dist/lib/yaml/builder.js +2 -2
  152. package/dist/lib/yaml/builder.js.map +1 -1
  153. package/dist/lib/yaml/index.js +16 -22
  154. package/dist/lib/yaml/index.js.map +1 -1
  155. package/dist/lib/yaml/player.js +123 -100
  156. package/dist/lib/yaml/player.js.map +1 -1
  157. package/dist/lib/yaml/utils.js +6 -6
  158. package/dist/lib/yaml/utils.js.map +1 -1
  159. package/dist/lib/yaml.js +1 -1
  160. package/dist/lib/yaml.js.map +1 -1
  161. package/dist/types/agent/agent.d.ts +62 -17
  162. package/dist/types/agent/execution-session.d.ts +36 -0
  163. package/dist/types/agent/index.d.ts +3 -2
  164. package/dist/types/agent/task-builder.d.ts +35 -0
  165. package/dist/types/agent/tasks.d.ts +32 -23
  166. package/dist/types/agent/ui-utils.d.ts +9 -2
  167. package/dist/types/agent/utils.d.ts +9 -35
  168. package/dist/types/ai-model/conversation-history.d.ts +8 -4
  169. package/dist/types/ai-model/index.d.ts +5 -5
  170. package/dist/types/ai-model/inspect.d.ts +20 -12
  171. package/dist/types/ai-model/llm-planning.d.ts +3 -1
  172. package/dist/types/ai-model/prompt/llm-locator.d.ts +1 -6
  173. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
  174. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +1 -3
  175. package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
  176. package/dist/types/ai-model/prompt/util.d.ts +2 -34
  177. package/dist/types/ai-model/service-caller/index.d.ts +2 -3
  178. package/dist/types/ai-model/ui-tars-planning.d.ts +15 -2
  179. package/dist/types/{ai-model/common.d.ts → common.d.ts} +6 -6
  180. package/dist/types/device/device-options.d.ts +57 -0
  181. package/dist/types/device/index.d.ts +55 -39
  182. package/dist/types/index.d.ts +7 -6
  183. package/dist/types/service/index.d.ts +26 -0
  184. package/dist/types/service/utils.d.ts +2 -0
  185. package/dist/types/task-runner.d.ts +49 -0
  186. package/dist/types/tree.d.ts +4 -1
  187. package/dist/types/types.d.ts +103 -66
  188. package/dist/types/yaml/utils.d.ts +1 -1
  189. package/dist/types/yaml.d.ts +68 -43
  190. package/package.json +9 -12
  191. package/dist/es/ai-model/action-executor.mjs +0 -129
  192. package/dist/es/ai-model/action-executor.mjs.map +0 -1
  193. package/dist/es/ai-model/common.mjs.map +0 -1
  194. package/dist/es/insight/index.mjs.map +0 -1
  195. package/dist/es/insight/utils.mjs.map +0 -1
  196. package/dist/lib/ai-model/action-executor.js +0 -163
  197. package/dist/lib/ai-model/action-executor.js.map +0 -1
  198. package/dist/lib/ai-model/common.js.map +0 -1
  199. package/dist/lib/insight/index.js.map +0 -1
  200. package/dist/lib/insight/utils.js.map +0 -1
  201. package/dist/types/ai-model/action-executor.d.ts +0 -19
  202. package/dist/types/insight/index.d.ts +0 -31
  203. package/dist/types/insight/utils.d.ts +0 -2
@@ -1,7 +1,14 @@
1
- import type { AndroidPullParam, DetailedLocateParam, ExecutionTask, ScrollParam } from '../types';
1
+ import type { DetailedLocateParam, ExecutionTask, PullParam, ScrollParam } from '../types';
2
2
  export declare function typeStr(task: ExecutionTask): any;
3
3
  export declare function locateParamStr(locate?: DetailedLocateParam | string): string;
4
4
  export declare function scrollParamStr(scrollParam?: ScrollParam): string;
5
- export declare function pullParamStr(pullParam?: AndroidPullParam): string;
5
+ export declare function pullParamStr(pullParam?: PullParam): string;
6
+ export declare function extractInsightParam(taskParam: any): {
7
+ content: string;
8
+ images?: Array<{
9
+ name: string;
10
+ url: string;
11
+ }>;
12
+ };
6
13
  export declare function taskTitleStr(type: 'Tap' | 'Hover' | 'Input' | 'RightClick' | 'KeyboardPress' | 'Scroll' | 'Action' | 'Query' | 'Assert' | 'WaitFor' | 'Locate' | 'Boolean' | 'Number' | 'String', prompt: string): string;
7
14
  export declare function paramStr(task: ExecutionTask): string;
@@ -1,7 +1,7 @@
1
- import type { TMultimodalPrompt, TUserPrompt } from '../ai-model/common';
1
+ import type { TMultimodalPrompt, TUserPrompt } from '../common';
2
2
  import type { AbstractInterface } from '../device';
3
- import type { BaseElement, ElementCacheFeature, ElementTreeNode, ExecutionDump, ExecutorContext, LocateResultElement, PlanningLocateParam, UIContext } from '../types';
4
- import type { TaskExecutor } from './tasks';
3
+ import type { ElementCacheFeature, LocateResultElement, PlanningLocateParam, UIContext } from '../types';
4
+ import type { TaskCache } from './task-cache';
5
5
  export declare function commonContextParser(interfaceInstance: AbstractInterface, _opt: {
6
6
  uploadServerUrl?: string;
7
7
  }): Promise<UIContext>;
@@ -13,38 +13,12 @@ export declare function printReportMsg(filepath: string): void;
13
13
  */
14
14
  export declare function getCurrentExecutionFile(trace?: string): string | false;
15
15
  export declare function generateCacheId(fileName?: string): string;
16
- export declare function matchElementFromPlan(planLocateParam: PlanningLocateParam, tree: ElementTreeNode<BaseElement>): any;
17
- export declare function matchElementFromCache(taskExecutor: TaskExecutor, cacheEntry: ElementCacheFeature | undefined, cachePrompt: TUserPrompt, cacheable: boolean | undefined): Promise<LocateResultElement | undefined>;
18
- export declare function trimContextByViewport(execution: ExecutionDump): {
19
- tasks: {
20
- type: any;
21
- subType?: string;
22
- param?: any;
23
- thought?: string;
24
- locate?: PlanningLocateParam | null;
25
- uiContext?: UIContext;
26
- executor: (param: any, context: ExecutorContext) => void | Promise<void | import("../types").ExecutionTaskReturn<any, any> | undefined> | undefined;
27
- output?: any;
28
- log?: any;
29
- recorder?: import("../types").ExecutionRecorderItem[];
30
- hitBy?: import("../types").ExecutionTaskHitBy;
31
- status: "pending" | "running" | "finished" | "failed" | "cancelled";
32
- error?: Error;
33
- errorMessage?: string;
34
- errorStack?: string;
35
- timing?: {
36
- start: number;
37
- end?: number;
38
- cost?: number;
39
- };
40
- usage?: import("../types").AIUsageInfo;
41
- searchAreaUsage?: import("../types").AIUsageInfo;
42
- }[];
43
- name: string;
44
- description?: string;
45
- aiActionContext?: string;
46
- logTime: number;
47
- };
16
+ export declare function ifPlanLocateParamIsBbox(planLocateParam: PlanningLocateParam): boolean;
17
+ export declare function matchElementFromPlan(planLocateParam: PlanningLocateParam): LocateResultElement | undefined;
18
+ export declare function matchElementFromCache(context: {
19
+ taskCache?: TaskCache;
20
+ interfaceInstance: AbstractInterface;
21
+ }, cacheEntry: ElementCacheFeature | undefined, cachePrompt: TUserPrompt, cacheable: boolean | undefined): Promise<LocateResultElement | undefined>;
48
22
  export declare const getMidsceneVersion: () => string;
49
23
  export declare const parsePrompt: (prompt: TUserPrompt) => {
50
24
  textPrompt: string;
@@ -1,18 +1,22 @@
1
1
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
2
2
  export interface ConversationHistoryOptions {
3
- maxUserImageMessages?: number;
4
3
  initialMessages?: ChatCompletionMessageParam[];
5
4
  }
6
5
  export declare class ConversationHistory {
7
- private readonly maxUserImageMessages;
8
6
  private readonly messages;
7
+ pendingFeedbackMessage: string;
9
8
  constructor(options?: ConversationHistoryOptions);
9
+ resetPendingFeedbackMessageIfExists(): void;
10
10
  append(message: ChatCompletionMessageParam): void;
11
11
  seed(messages: ChatCompletionMessageParam[]): void;
12
12
  reset(): void;
13
- snapshot(): ChatCompletionMessageParam[];
13
+ /**
14
+ * Snapshot the conversation history, and replace the images with text if the number of images exceeds the limit.
15
+ * @param maxImages - The maximum number of images to include in the snapshot. Undefined means no limit.
16
+ * @returns The snapshot of the conversation history.
17
+ */
18
+ snapshot(maxImages?: number): ChatCompletionMessageParam[];
14
19
  get length(): number;
15
20
  [Symbol.iterator](): IterableIterator<ChatCompletionMessageParam>;
16
21
  toJSON(): ChatCompletionMessageParam[];
17
- private pruneOldestUserMessageIfNecessary;
18
22
  }
@@ -1,13 +1,13 @@
1
1
  export { callAIWithStringResponse, callAIWithObjectResponse, callAI, } from './service-caller/index';
2
2
  export { systemPromptToLocateElement } from './prompt/llm-locator';
3
- export { describeUserPage, elementByPositionWithElementInfo, } from './prompt/util';
3
+ export { describeUserPage } from './prompt/util';
4
4
  export { generatePlaywrightTest, generatePlaywrightTestStream, } from './prompt/playwright-generator';
5
5
  export { generateYamlTest, generateYamlTestStream, } from './prompt/yaml-generator';
6
6
  export type { ChatCompletionMessageParam } from 'openai/resources/index';
7
- export { AiLocateElement, AiExtractElementInfo, AiLocateSection, } from './inspect';
7
+ export { AiLocateElement, AiExtractElementInfo, AiLocateSection, AiJudgeOrderSensitive, } from './inspect';
8
8
  export { plan } from './llm-planning';
9
- export { adaptBboxToRect } from './common';
9
+ export { adaptBboxToRect } from '../common';
10
10
  export { uiTarsPlanning, resizeImageForUiTars } from './ui-tars-planning';
11
11
  export { ConversationHistory, type ConversationHistoryOptions, } from './conversation-history';
12
- export { AIActionType, type AIArgs } from './common';
13
- export { getMidsceneLocationSchema, type MidsceneLocationResultType, PointSchema, SizeSchema, RectSchema, TMultimodalPromptSchema, TUserPromptSchema, type TMultimodalPrompt, type TUserPrompt, findAllMidsceneLocatorField, dumpActionParam, loadActionParam, parseActionParam, } from './common';
12
+ export { AIActionType, type AIArgs } from '../common';
13
+ export { getMidsceneLocationSchema, type MidsceneLocationResultType, PointSchema, SizeSchema, RectSchema, TMultimodalPromptSchema, TUserPromptSchema, type TMultimodalPrompt, type TUserPrompt, findAllMidsceneLocatorField, dumpActionParam, loadActionParam, parseActionParam, } from '../common';
@@ -1,29 +1,31 @@
1
- import type { AIDataExtractionResponse, AIElementLocatorResponse, AIElementResponse, AIUsageInfo, BaseElement, ElementById, InsightExtractOption, Rect, ReferenceImage, UIContext } from '../types';
1
+ import type { AIDataExtractionResponse, AIElementResponse, AIUsageInfo, Rect, ReferenceImage, ServiceExtractOption, UIContext } from '../types';
2
2
  import type { IModelConfig } from '@midscene/shared/env';
3
+ import type { LocateResultElement } from '@midscene/shared/types';
3
4
  import type { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources/index';
4
- import type { TMultimodalPrompt, TUserPrompt } from './common';
5
+ import type { TMultimodalPrompt, TUserPrompt } from '../common';
5
6
  import { callAIWithObjectResponse } from './service-caller/index';
6
7
  export type AIArgs = [
7
8
  ChatCompletionSystemMessageParam,
8
9
  ...ChatCompletionUserMessageParam[]
9
10
  ];
10
- export declare function AiLocateElement<ElementType extends BaseElement = BaseElement>(options: {
11
- context: UIContext<ElementType>;
11
+ export declare function AiLocateElement(options: {
12
+ context: UIContext;
12
13
  targetElementDescription: TUserPrompt;
13
14
  referenceImage?: ReferenceImage;
14
15
  callAIFn: typeof callAIWithObjectResponse<AIElementResponse | [number, number]>;
15
16
  searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
16
17
  modelConfig: IModelConfig;
17
18
  }): Promise<{
18
- parseResult: AIElementLocatorResponse;
19
+ parseResult: {
20
+ elements: LocateResultElement[];
21
+ errors?: string[];
22
+ };
19
23
  rect?: Rect;
20
24
  rawResponse: string;
21
- elementById: ElementById;
22
25
  usage?: AIUsageInfo;
23
- isOrderSensitive?: boolean;
24
26
  }>;
25
27
  export declare function AiLocateSection(options: {
26
- context: UIContext<BaseElement>;
28
+ context: UIContext;
27
29
  sectionDescription: TUserPrompt;
28
30
  modelConfig: IModelConfig;
29
31
  }): Promise<{
@@ -33,14 +35,20 @@ export declare function AiLocateSection(options: {
33
35
  rawResponse: string;
34
36
  usage?: AIUsageInfo;
35
37
  }>;
36
- export declare function AiExtractElementInfo<T, ElementType extends BaseElement = BaseElement>(options: {
38
+ export declare function AiExtractElementInfo<T>(options: {
37
39
  dataQuery: string | Record<string, string>;
38
40
  multimodalPrompt?: TMultimodalPrompt;
39
- context: UIContext<ElementType>;
40
- extractOption?: InsightExtractOption;
41
+ context: UIContext;
42
+ pageDescription?: string;
43
+ extractOption?: ServiceExtractOption;
41
44
  modelConfig: IModelConfig;
42
45
  }): Promise<{
43
46
  parseResult: AIDataExtractionResponse<T>;
44
- elementById: (idOrIndexId: string) => ElementType;
45
47
  usage: AIUsageInfo | undefined;
46
48
  }>;
49
+ export declare function AiJudgeOrderSensitive(description: string, callAIFn: typeof callAIWithObjectResponse<{
50
+ isOrderSensitive: boolean;
51
+ }>, modelConfig: IModelConfig): Promise<{
52
+ isOrderSensitive: boolean;
53
+ usage?: AIUsageInfo;
54
+ }>;
@@ -7,5 +7,7 @@ export declare function plan(userInstruction: string, opts: {
7
7
  actionSpace: DeviceAction<any>[];
8
8
  actionContext?: string;
9
9
  modelConfig: IModelConfig;
10
- conversationHistory?: ConversationHistory;
10
+ conversationHistory: ConversationHistory;
11
+ includeBbox: boolean;
12
+ imagesIncludeCount?: number;
11
13
  }): Promise<PlanningAIResponse>;
@@ -1,8 +1,3 @@
1
1
  import type { TVlModeTypes } from '@midscene/shared/env';
2
- import type { ResponseFormatJSONSchema } from 'openai/resources/index';
3
2
  export declare function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined): string;
4
- export declare const locatorSchema: ResponseFormatJSONSchema;
5
- export declare const findElementPrompt: ({ pageDescription, targetElementDescription, }: {
6
- pageDescription: string;
7
- targetElementDescription: string;
8
- }) => string;
3
+ export declare const findElementPrompt: (targetElementDescription: string) => string;
@@ -1,9 +1,8 @@
1
1
  import type { DeviceAction } from '../../types';
2
2
  import type { TVlModeTypes } from '@midscene/shared/env';
3
- import type { ResponseFormatJSONSchema } from 'openai/resources/index';
4
3
  export declare const descriptionForAction: (action: DeviceAction<any>, locatorSchemaTypeDescription: string) => string;
5
- export declare function systemPromptToTaskPlanning({ actionSpace, vlMode, }: {
4
+ export declare function systemPromptToTaskPlanning({ actionSpace, vlMode, includeBbox, }: {
6
5
  actionSpace: DeviceAction<any>[];
7
6
  vlMode: TVlModeTypes | undefined;
7
+ includeBbox: boolean;
8
8
  }): Promise<string>;
9
- export declare const planSchema: ResponseFormatJSONSchema;
@@ -1,5 +1,3 @@
1
1
  import type { TVlModeTypes } from '@midscene/shared/env';
2
2
  export declare function systemPromptToLocateSection(vlMode: TVlModeTypes | undefined): string;
3
- export declare const sectionLocatorInstruction: ({ sectionDescription, }: {
4
- sectionDescription: string;
5
- }) => string;
3
+ export declare const sectionLocatorInstruction: (sectionDescription: string) => string;
@@ -0,0 +1,2 @@
1
+ export declare function systemPromptToJudgeOrderSensitive(): string;
2
+ export declare const orderSensitiveJudgePrompt: (description: string) => string;
@@ -1,17 +1,9 @@
1
- import type { BaseElement, ElementTreeNode, Size, UIContext } from '../../types';
2
- import type { TVlModeTypes } from '@midscene/shared/env';
1
+ import type { BaseElement, Size, UIContext } from '../../types';
3
2
  export declare function describeSize(size: Size): string;
4
3
  export declare function describeElement(elements: (Pick<BaseElement, 'rect' | 'content'> & {
5
4
  id: string;
6
5
  })[]): string;
7
6
  export declare const distanceThreshold = 16;
8
- export declare function elementByPositionWithElementInfo(treeRoot: ElementTreeNode<BaseElement>, position: {
9
- x: number;
10
- y: number;
11
- }, options?: {
12
- requireStrictDistance?: boolean;
13
- filterPositionElements?: boolean;
14
- }): BaseElement | undefined;
15
7
  export declare function distance(point1: {
16
8
  x: number;
17
9
  y: number;
@@ -20,28 +12,4 @@ export declare function distance(point1: {
20
12
  y: number;
21
13
  }): number;
22
14
  export declare const samplePageDescription = "\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n The username is accepted\n </h4>\n ...many more\n</div>\n====================\n";
23
- export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, opt: {
24
- truncateTextLength?: number;
25
- filterNonTextContent?: boolean;
26
- domIncluded?: boolean | 'visible-only';
27
- visibleOnly?: boolean;
28
- vlMode: TVlModeTypes | undefined;
29
- }): Promise<{
30
- description: string;
31
- elementById(idOrIndexId: string): ElementType;
32
- elementByPosition(position: {
33
- x: number;
34
- y: number;
35
- }, size: {
36
- width: number;
37
- height: number;
38
- }): BaseElement | undefined;
39
- insertElementByPosition(position: {
40
- x: number;
41
- y: number;
42
- }): ElementType;
43
- size: {
44
- width: number;
45
- height: number;
46
- };
47
- }>;
15
+ export declare function describeUserPage(context: UIContext): Promise<string>;
@@ -1,9 +1,8 @@
1
1
  import { type AIUsageInfo } from '../../types';
2
2
  import type { StreamingCallback } from '../../types';
3
3
  import { type IModelConfig, type TVlModeTypes } from '@midscene/shared/env';
4
- import OpenAI from 'openai';
5
4
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
6
- import { AIActionType, type AIArgs } from '../common';
5
+ import type { AIActionType, AIArgs } from '../../common';
7
6
  export declare function callAI(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelConfig: IModelConfig, options?: {
8
7
  stream?: boolean;
9
8
  onChunk?: StreamingCallback;
@@ -12,9 +11,9 @@ export declare function callAI(messages: ChatCompletionMessageParam[], AIActionT
12
11
  usage?: AIUsageInfo;
13
12
  isStreamed: boolean;
14
13
  }>;
15
- export declare const getResponseFormat: (modelName: string, AIActionTypeValue: AIActionType) => OpenAI.ChatCompletionCreateParams["response_format"] | OpenAI.ResponseFormatJSONObject;
16
14
  export declare function callAIWithObjectResponse<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelConfig: IModelConfig): Promise<{
17
15
  content: T;
16
+ contentString: string;
18
17
  usage?: AIUsageInfo;
19
18
  }>;
20
19
  export declare function callAIWithStringResponse(msgs: AIArgs, AIActionTypeValue: AIActionType, modelConfig: IModelConfig): Promise<{
@@ -1,11 +1,12 @@
1
1
  import type { PlanningAIResponse, Size, UIContext } from '../types';
2
2
  import { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';
3
3
  import type { ConversationHistory } from './conversation-history';
4
- type ActionType = 'click' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
4
+ type ActionType = 'click' | 'left_double' | 'right_single' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
5
5
  export declare function uiTarsPlanning(userInstruction: string, options: {
6
6
  conversationHistory: ConversationHistory;
7
7
  context: UIContext;
8
8
  modelConfig: IModelConfig;
9
+ actionContext?: string;
9
10
  }): Promise<PlanningAIResponse>;
10
11
  interface BaseAction {
11
12
  action_type: ActionType;
@@ -32,6 +33,18 @@ interface WaitAction extends BaseAction {
32
33
  time: string;
33
34
  };
34
35
  }
36
+ interface LeftDoubleAction extends BaseAction {
37
+ action_type: 'left_double';
38
+ action_inputs: {
39
+ start_box: string;
40
+ };
41
+ }
42
+ interface RightSingleAction extends BaseAction {
43
+ action_type: 'right_single';
44
+ action_inputs: {
45
+ start_box: string;
46
+ };
47
+ }
35
48
  interface TypeAction extends BaseAction {
36
49
  action_type: 'type';
37
50
  action_inputs: {
@@ -54,6 +67,6 @@ interface FinishedAction extends BaseAction {
54
67
  action_type: 'finished';
55
68
  action_inputs: Record<string, never>;
56
69
  }
57
- export type Action = ClickAction | DragAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction;
70
+ export type Action = ClickAction | LeftDoubleAction | RightSingleAction | DragAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction;
58
71
  export declare function resizeImageForUiTars(imageBase64: string, size: Size, uiTarsVersion: UITarsModelVersion | undefined): Promise<string>;
59
72
  export {};
@@ -1,6 +1,6 @@
1
- import type { BaseElement, DeviceAction, ElementTreeNode, MidsceneYamlFlowItem, PlanningAction, Rect, Size } from '../types';
1
+ import type { BaseElement, DeviceAction, ElementTreeNode, MidsceneYamlFlowItem, PlanningAction, Rect, Size } from './types';
2
2
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
3
- import type { PlanningLocateParam } from '../types';
3
+ import type { PlanningLocateParam } from './types';
4
4
  import type { TVlModeTypes } from '@midscene/shared/env';
5
5
  import { z } from 'zod';
6
6
  export type AIArgs = ChatCompletionMessageParam[];
@@ -12,14 +12,14 @@ export declare enum AIActionType {
12
12
  DESCRIBE_ELEMENT = 4,
13
13
  TEXT = 5
14
14
  }
15
+ type AdaptBboxInput = number[] | string[] | string | (number[] | string[])[];
15
16
  export declare function fillBboxParam(locate: PlanningLocateParam, width: number, height: number, rightLimit: number, bottomLimit: number, vlMode: TVlModeTypes | undefined): PlanningLocateParam;
16
- export declare function adaptQwenBbox(bbox: number[]): [number, number, number, number];
17
+ export declare function adaptQwen2_5Bbox(bbox: number[]): [number, number, number, number];
17
18
  export declare function adaptDoubaoBbox(bbox: string[] | number[] | string, width: number, height: number): [number, number, number, number];
18
- export declare function adaptBbox(bbox: number[], width: number, height: number, rightLimit: number, bottomLimit: number, vlMode: TVlModeTypes | undefined): [number, number, number, number];
19
+ export declare function adaptBbox(bbox: AdaptBboxInput, width: number, height: number, rightLimit: number, bottomLimit: number, vlMode: TVlModeTypes | undefined): [number, number, number, number];
19
20
  export declare function normalized01000(bbox: number[], width: number, height: number): [number, number, number, number];
20
21
  export declare function adaptGeminiBbox(bbox: number[], width: number, height: number): [number, number, number, number];
21
22
  export declare function adaptBboxToRect(bbox: number[], width: number, height: number, offsetX?: number, offsetY?: number, rightLimit?: number, bottomLimit?: number, vlMode?: TVlModeTypes | undefined): Rect;
22
- export declare function warnGPT4oSizeLimit(size: Size, modelName: string): void;
23
23
  export declare function mergeRects(rects: Rect[]): {
24
24
  left: number;
25
25
  top: number;
@@ -565,5 +565,5 @@ export declare const loadActionParam: (jsonObject: Record<string, any>, zodSchem
565
565
  * Locator fields are special business logic fields with complex validation requirements,
566
566
  * so they are intentionally excluded from Zod parsing and use existing validation logic.
567
567
  */
568
- export declare const parseActionParam: (rawParam: Record<string, any>, zodSchema: z.ZodType<any>) => Record<string, any>;
568
+ export declare const parseActionParam: (rawParam: Record<string, any> | undefined, zodSchema?: z.ZodType<any>) => Record<string, any> | undefined;
569
569
  export {};
@@ -0,0 +1,57 @@
1
+ import type { DeviceAction } from '../types';
2
+ /**
3
+ * Android device input options
4
+ */
5
+ export type AndroidDeviceInputOpt = {
6
+ /** Automatically dismiss the keyboard after input is completed */
7
+ autoDismissKeyboard?: boolean;
8
+ /** Strategy for dismissing the keyboard: 'esc-first' tries ESC before BACK, 'back-first' tries BACK before ESC */
9
+ keyboardDismissStrategy?: 'esc-first' | 'back-first';
10
+ };
11
+ /**
12
+ * Android device options
13
+ */
14
+ export type AndroidDeviceOpt = {
15
+ /** Path to the ADB executable */
16
+ androidAdbPath?: string;
17
+ /** Remote ADB host address */
18
+ remoteAdbHost?: string;
19
+ /** Remote ADB port */
20
+ remoteAdbPort?: number;
21
+ /** Input method editor strategy: 'always-yadb' always uses yadb, 'yadb-for-non-ascii' uses yadb only for non-ASCII characters */
22
+ imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii';
23
+ /** Display ID to use for this device */
24
+ displayId?: number;
25
+ /** Use physical display ID for screenshot operations */
26
+ usePhysicalDisplayIdForScreenshot?: boolean;
27
+ /** Use physical display ID when looking up display information */
28
+ usePhysicalDisplayIdForDisplayLookup?: boolean;
29
+ /** Custom device actions to register */
30
+ customActions?: DeviceAction<any>[];
31
+ /** Screenshot resize scale factor */
32
+ screenshotResizeScale?: number;
33
+ /** Always fetch screen info on each call; if false, cache the first result */
34
+ alwaysRefreshScreenInfo?: boolean;
35
+ } & AndroidDeviceInputOpt;
36
+ /**
37
+ * iOS device input options
38
+ */
39
+ export type IOSDeviceInputOpt = {
40
+ /** Automatically dismiss the keyboard after input is completed */
41
+ autoDismissKeyboard?: boolean;
42
+ };
43
+ /**
44
+ * iOS device options
45
+ */
46
+ export type IOSDeviceOpt = {
47
+ /** Device ID (UDID) to connect to */
48
+ deviceId?: string;
49
+ /** Custom device actions to register */
50
+ customActions?: DeviceAction<any>[];
51
+ /** WebDriverAgent port (default: 8100) */
52
+ wdaPort?: number;
53
+ /** WebDriverAgent host (default: 'localhost') */
54
+ wdaHost?: string;
55
+ /** Whether to use WebDriverAgent */
56
+ useWDA?: boolean;
57
+ } & IOSDeviceInputOpt;