@midscene/core 0.26.7-beta-20250818081955.0 → 0.26.7-beta-20250820150415.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/es/ai-model/common.mjs +130 -22
  2. package/dist/es/ai-model/common.mjs.map +1 -1
  3. package/dist/es/ai-model/index.mjs +3 -3
  4. package/dist/es/ai-model/inspect.mjs +28 -16
  5. package/dist/es/ai-model/inspect.mjs.map +1 -1
  6. package/dist/es/ai-model/llm-planning.mjs +25 -23
  7. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  8. package/dist/es/ai-model/prompt/llm-planning.mjs +69 -23
  9. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
  11. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  12. package/dist/es/ai-model/prompt/util.mjs +2 -2
  13. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  14. package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
  15. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  16. package/dist/es/ai-model/service-caller/index.mjs +72 -118
  17. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  18. package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
  19. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  20. package/dist/es/index.mjs +3 -2
  21. package/dist/es/index.mjs.map +1 -1
  22. package/dist/es/insight/index.mjs +13 -61
  23. package/dist/es/insight/index.mjs.map +1 -1
  24. package/dist/es/types.mjs.map +1 -1
  25. package/dist/es/utils.mjs +5 -6
  26. package/dist/es/utils.mjs.map +1 -1
  27. package/dist/lib/ai-model/common.js +166 -28
  28. package/dist/lib/ai-model/common.js.map +1 -1
  29. package/dist/lib/ai-model/index.js +31 -10
  30. package/dist/lib/ai-model/inspect.js +27 -15
  31. package/dist/lib/ai-model/inspect.js.map +1 -1
  32. package/dist/lib/ai-model/llm-planning.js +24 -22
  33. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  34. package/dist/lib/ai-model/prompt/llm-planning.js +71 -25
  35. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  36. package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
  37. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  38. package/dist/lib/ai-model/prompt/util.js +2 -2
  39. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  40. package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
  41. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  42. package/dist/lib/ai-model/service-caller/index.js +75 -124
  43. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  44. package/dist/lib/ai-model/ui-tars-planning.js +5 -5
  45. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  46. package/dist/lib/index.js +25 -3
  47. package/dist/lib/index.js.map +1 -1
  48. package/dist/lib/insight/index.js +10 -58
  49. package/dist/lib/insight/index.js.map +1 -1
  50. package/dist/lib/types.js.map +1 -1
  51. package/dist/lib/utils.js +4 -5
  52. package/dist/lib/utils.js.map +1 -1
  53. package/dist/types/ai-model/common.d.ts +517 -8
  54. package/dist/types/ai-model/index.d.ts +2 -2
  55. package/dist/types/ai-model/inspect.d.ts +4 -1
  56. package/dist/types/ai-model/llm-planning.d.ts +1 -1
  57. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
  58. package/dist/types/ai-model/prompt/util.d.ts +2 -1
  59. package/dist/types/ai-model/service-caller/index.d.ts +6 -6
  60. package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
  61. package/dist/types/index.d.ts +3 -1
  62. package/dist/types/insight/index.d.ts +2 -4
  63. package/dist/types/types.d.ts +9 -31
  64. package/dist/types/yaml.d.ts +6 -2
  65. package/package.json +4 -3
@@ -3,7 +3,7 @@ import { callAiFn } from './common';
3
3
  export declare function plan(userInstruction: string, opts: {
4
4
  context: UIContext;
5
5
  pageType: PageType;
6
- actionSpace: DeviceAction[];
6
+ actionSpace: DeviceAction<any>[];
7
7
  callAI?: typeof callAiFn<PlanningAIResponse>;
8
8
  log?: string;
9
9
  actionContext?: string;
@@ -2,9 +2,9 @@ import type { DeviceAction } from '../../types';
2
2
  import { PromptTemplate } from '@langchain/core/prompts';
3
3
  import type { vlLocateMode } from '@midscene/shared/env';
4
4
  import type { ResponseFormatJSONSchema } from 'openai/resources/index';
5
- export declare const descriptionForAction: (action: DeviceAction, locatorScheme: string) => string;
5
+ export declare const descriptionForAction: (action: DeviceAction<any>, locatorSchemaTypeDescription: string) => string;
6
6
  export declare function systemPromptToTaskPlanning({ actionSpace, vlMode, }: {
7
- actionSpace: DeviceAction[];
7
+ actionSpace: DeviceAction<any>[];
8
8
  vlMode: ReturnType<typeof vlLocateMode>;
9
9
  }): Promise<string>;
10
10
  export declare const planSchema: ResponseFormatJSONSchema;
@@ -1,4 +1,5 @@
1
1
  import type { BaseElement, ElementTreeNode, Size, UIContext } from '../../types';
2
+ import { type IModelPreferences } from '@midscene/shared/env';
2
3
  export declare function describeSize(size: Size): string;
3
4
  export declare function describeElement(elements: (Pick<BaseElement, 'rect' | 'content'> & {
4
5
  id: string;
@@ -19,7 +20,7 @@ export declare function distance(point1: {
19
20
  y: number;
20
21
  }): number;
21
22
  export declare const samplePageDescription = "\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n The username is accepted\n </h4>\n ...many more\n</div>\n====================\n";
22
- export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, opt?: {
23
+ export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, modelPreferences: IModelPreferences, opt?: {
23
24
  truncateTextLength?: number;
24
25
  filterNonTextContent?: boolean;
25
26
  domIncluded?: boolean | 'visible-only';
@@ -1,11 +1,10 @@
1
1
  import { type AIUsageInfo } from '../../types';
2
2
  import type { StreamingCallback } from '../../types';
3
+ import { type IModelPreferences } from '@midscene/shared/env';
3
4
  import OpenAI from 'openai';
4
5
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
5
6
  import { AIActionType, type AIArgs } from '../common';
6
- export declare function checkAIConfig(): boolean;
7
- export declare function getModelName(): string;
8
- export declare function call(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, responseFormat?: OpenAI.ChatCompletionCreateParams['response_format'] | OpenAI.ResponseFormatJSONObject, options?: {
7
+ export declare function call(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences, options?: {
9
8
  stream?: boolean;
10
9
  onChunk?: StreamingCallback;
11
10
  }): Promise<{
@@ -13,14 +12,15 @@ export declare function call(messages: ChatCompletionMessageParam[], AIActionTyp
13
12
  usage?: AIUsageInfo;
14
13
  isStreamed: boolean;
15
14
  }>;
16
- export declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
15
+ export declare const getResponseFormat: (modelName: string, AIActionTypeValue: AIActionType) => OpenAI.ChatCompletionCreateParams["response_format"] | OpenAI.ResponseFormatJSONObject;
16
+ export declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences): Promise<{
17
17
  content: T;
18
18
  usage?: AIUsageInfo;
19
19
  }>;
20
- export declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType): Promise<{
20
+ export declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences): Promise<{
21
21
  content: string;
22
22
  usage?: AIUsageInfo;
23
23
  }>;
24
24
  export declare function extractJSONFromCodeBlock(response: string): string;
25
25
  export declare function preprocessDoubaoBboxJson(input: string): string;
26
- export declare function safeParseJson(input: string): any;
26
+ export declare function safeParseJson(input: string, modelPreferences: IModelPreferences): any;
@@ -1,4 +1,5 @@
1
1
  import type { AIUsageInfo, MidsceneYamlFlowItem, PlanningAction, Size } from '../types';
2
+ import { type IModelPreferences } from '@midscene/shared/env';
2
3
  import { actionParser } from '@ui-tars/action-parser';
3
4
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
4
5
  type ActionType = 'click' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait' | 'androidBackButton' | 'androidHomeButton' | 'androidRecentAppsButton' | 'androidLongPress' | 'androidPull';
@@ -9,6 +10,7 @@ export declare function vlmPlanning(options: {
9
10
  width: number;
10
11
  height: number;
11
12
  };
13
+ modelPreferences: IModelPreferences;
12
14
  }): Promise<{
13
15
  actions: PlanningAction<any>[];
14
16
  actionsFromModel: ReturnType<typeof actionParser>['parsed'];
@@ -72,5 +74,5 @@ interface AndroidLongPressAction extends BaseAction {
72
74
  };
73
75
  }
74
76
  export type Action = ClickAction | DragAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction | AndroidLongPressAction;
75
- export declare function resizeImageForUiTars(imageBase64: string, size: Size): Promise<string>;
77
+ export declare function resizeImageForUiTars(imageBase64: string, size: Size, modelPreferences: IModelPreferences): Promise<string>;
76
78
  export {};
@@ -1,9 +1,11 @@
1
+ import { z } from 'zod';
1
2
  import { Executor } from './ai-model/action-executor';
2
3
  import Insight from './insight/index';
3
4
  import { getVersion } from './utils';
4
- export { plan, describeUserPage, AiLocateElement, } from './ai-model/index';
5
+ export { plan, describeUserPage, AiLocateElement, getMidsceneLocationSchema, type MidsceneLocationResultType, PointSchema, SizeSchema, RectSchema, TMultimodalPromptSchema, TUserPromptSchema, type TMultimodalPrompt, type TUserPrompt, } from './ai-model/index';
5
6
  export { getAIConfig, MIDSCENE_MODEL_NAME } from '@midscene/shared/env';
6
7
  export type * from './types';
8
+ export { z };
7
9
  export default Insight;
8
10
  export { Executor, Insight, getVersion };
9
11
  export type { MidsceneYamlScript, MidsceneYamlTask, MidsceneYamlFlowItem, MidsceneYamlFlowItemAIRightClick, MidsceneYamlConfigResult, LocateOption, DetailedLocateParam, } from './yaml';
@@ -1,5 +1,6 @@
1
1
  import { callAiFn } from '../ai-model/common';
2
- import type { AIDescribeElementResponse, AIElementResponse, AIUsageInfo, BaseElement, DetailedLocateParam, DumpSubscriber, InsightAction, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, Rect, TMultimodalPrompt, UIContext } from '../types';
2
+ import type { AIElementResponse, AIUsageInfo, BaseElement, DetailedLocateParam, DumpSubscriber, InsightAction, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, UIContext } from '../types';
3
+ import type { TMultimodalPrompt } from '../ai-model/common';
3
4
  export interface LocateOpts {
4
5
  context?: UIContext<BaseElement>;
5
6
  callAI?: typeof callAiFn<AIElementResponse>;
@@ -19,7 +20,4 @@ export default class Insight<ElementType extends BaseElement = BaseElement, Cont
19
20
  thought?: string;
20
21
  usage?: AIUsageInfo;
21
22
  }>;
22
- describe(target: Rect | [number, number], opt?: {
23
- deepThink?: boolean;
24
- }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
25
23
  }
@@ -1,6 +1,8 @@
1
1
  import type { NodeType } from '@midscene/shared/constants';
2
2
  import type { BaseElement, ElementTreeNode, Rect, Size } from '@midscene/shared/types';
3
3
  import type { ChatCompletionMessageParam } from 'openai/resources/index';
4
+ import type { z } from 'zod';
5
+ import type { TUserPrompt } from './ai-model/common';
4
6
  import type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';
5
7
  export type { ElementTreeNode, BaseElement, Rect, Size, Point, } from '@midscene/shared/types';
6
8
  export * from './yaml';
@@ -106,10 +108,10 @@ export type EnsureObject<T> = {
106
108
  export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
107
109
  export type InsightExtractParam = string | Record<string, string>;
108
110
  export type LocateResultElement = {
109
- id: string;
110
- indexId?: number;
111
111
  center: [number, number];
112
112
  rect: Rect;
113
+ id: string;
114
+ indexId?: number;
113
115
  xpaths: string[];
114
116
  attributes: {
115
117
  nodeType: NodeType;
@@ -189,7 +191,7 @@ export interface PlanningLocateParam extends DetailedLocateParam {
189
191
  }
190
192
  export interface PlanningAction<ParamType = any> {
191
193
  thought?: string;
192
- type: 'Locate' | 'Tap' | 'RightClick' | 'Hover' | 'Drag' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished' | 'AndroidBackButton' | 'AndroidHomeButton' | 'AndroidRecentAppsButton' | 'AndroidLongPress' | 'AndroidPull';
194
+ type: string;
193
195
  param: ParamType;
194
196
  locate?: PlanningLocateParam | null;
195
197
  }
@@ -241,7 +243,6 @@ export interface Color {
241
243
  }
242
244
  export interface BaseAgentParserOpt {
243
245
  selector?: string;
244
- ignoreMarker?: boolean;
245
246
  }
246
247
  export interface PuppeteerParserOpt extends BaseAgentParserOpt {
247
248
  }
@@ -369,33 +370,10 @@ export interface StreamingAIResponse {
369
370
  /** Whether the response was streamed */
370
371
  isStreamed: boolean;
371
372
  }
372
- export type TMultimodalPrompt = {
373
- /**
374
- * Support use image to inspect elements.
375
- * The "images" field is an object that uses image name as key and image url as value.
376
- * The image url can be a local path, a http link , or a base64 string.
377
- */
378
- images?: {
379
- name: string;
380
- url: string;
381
- }[];
382
- /**
383
- * By default, the image url in the "images" filed starts with `https://` or `http://` will be directly sent to the LLM.
384
- * In case the images are not accessible to the LLM (One common case is that image url is internal network only.), you can enable this option.
385
- * Then image will be download and convert to base64 format.
386
- */
387
- convertHttpImage2Base64?: boolean;
388
- };
389
- export type TUserPrompt = string | ({
390
- prompt: string;
391
- } & Partial<TMultimodalPrompt>);
392
- export interface DeviceAction<ParamType = any> {
373
+ export interface DeviceAction<T = {}> {
393
374
  name: string;
394
- interfaceAlias?: string;
395
375
  description?: string;
396
- paramSchema?: string;
397
- paramDescription?: string;
398
- location?: 'required' | 'optional' | false;
399
- whatToLocate?: string;
400
- call: (context: ExecutorContext, param: ParamType) => Promise<void> | void;
376
+ interfaceAlias?: string;
377
+ paramSchema?: z.ZodType<T>;
378
+ call: (param: T, context: ExecutorContext) => Promise<void> | void;
401
379
  }
@@ -1,6 +1,8 @@
1
- import type { Rect, TUserPrompt } from './types';
1
+ import type { TUserPrompt } from './ai-model/common';
2
+ import type { Rect } from './types';
2
3
  import type { BaseElement, UIContext } from './types';
3
4
  export interface LocateOption {
5
+ prompt?: TUserPrompt;
4
6
  deepThink?: boolean;
5
7
  cacheable?: boolean;
6
8
  xpath?: string;
@@ -11,6 +13,7 @@ export interface InsightExtractOption {
11
13
  screenshotIncluded?: boolean;
12
14
  returnThought?: boolean;
13
15
  isWaitForAssert?: boolean;
16
+ doNotThrowError?: boolean;
14
17
  }
15
18
  export interface ReferenceImage {
16
19
  base64: string;
@@ -72,6 +75,7 @@ export interface MidsceneYamlFlowItemAIAction {
72
75
  export interface MidsceneYamlFlowItemAIAssert {
73
76
  aiAssert: string;
74
77
  errorMessage?: string;
78
+ name?: string;
75
79
  }
76
80
  export interface MidsceneYamlFlowItemAIQuery extends InsightExtractOption {
77
81
  aiQuery: string;
@@ -116,7 +120,7 @@ export interface MidsceneYamlFlowItemAIInput extends LocateOption {
116
120
  }
117
121
  export interface MidsceneYamlFlowItemAIKeyboardPress extends LocateOption {
118
122
  aiKeyboardPress: TUserPrompt | undefined;
119
- key: string;
123
+ keyName: string;
120
124
  }
121
125
  export interface MidsceneYamlFlowItemAIScroll extends LocateOption, ScrollParam {
122
126
  aiScroll: TUserPrompt | undefined;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "0.26.7-beta-20250818081955.0",
4
+ "version": "0.26.7-beta-20250820150415.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "main": "./dist/lib/index.js",
@@ -60,8 +60,9 @@
60
60
  "langsmith": "0.3.7",
61
61
  "openai": "4.81.0",
62
62
  "socks-proxy-agent": "8.0.4",
63
- "@midscene/recorder": "0.26.7-beta-20250818081955.0",
64
- "@midscene/shared": "0.26.7-beta-20250818081955.0"
63
+ "zod": "3.24.3",
64
+ "@midscene/recorder": "0.26.7-beta-20250820150415.0",
65
+ "@midscene/shared": "0.26.7-beta-20250820150415.0"
65
66
  },
66
67
  "devDependencies": {
67
68
  "@microsoft/api-extractor": "^7.52.10",