@midscene/core 0.28.12-beta-20250923124135.0 → 0.28.12-beta-20250924031347.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/dist/es/agent/agent.mjs +1 -1
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +45 -160
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +1 -1
  6. package/dist/es/ai-model/conversation-history.mjs +58 -0
  7. package/dist/es/ai-model/conversation-history.mjs.map +1 -0
  8. package/dist/es/ai-model/index.mjs +3 -2
  9. package/dist/es/ai-model/llm-planning.mjs +44 -12
  10. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  11. package/dist/es/ai-model/prompt/llm-planning.mjs +7 -61
  12. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  13. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  14. package/dist/es/ai-model/ui-tars-planning.mjs +40 -18
  15. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  16. package/dist/es/index.mjs.map +1 -1
  17. package/dist/es/types.mjs.map +1 -1
  18. package/dist/es/utils.mjs +2 -2
  19. package/dist/lib/agent/agent.js +1 -1
  20. package/dist/lib/agent/agent.js.map +1 -1
  21. package/dist/lib/agent/tasks.js +44 -159
  22. package/dist/lib/agent/tasks.js.map +1 -1
  23. package/dist/lib/agent/utils.js +1 -1
  24. package/dist/lib/ai-model/conversation-history.js +92 -0
  25. package/dist/lib/ai-model/conversation-history.js.map +1 -0
  26. package/dist/lib/ai-model/index.js +8 -4
  27. package/dist/lib/ai-model/llm-planning.js +43 -11
  28. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  29. package/dist/lib/ai-model/prompt/llm-planning.js +7 -67
  30. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  31. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  32. package/dist/lib/ai-model/ui-tars-planning.js +42 -20
  33. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  34. package/dist/lib/index.js.map +1 -1
  35. package/dist/lib/types.js.map +1 -1
  36. package/dist/lib/utils.js +2 -2
  37. package/dist/types/agent/tasks.d.ts +4 -17
  38. package/dist/types/ai-model/conversation-history.d.ts +18 -0
  39. package/dist/types/ai-model/index.d.ts +2 -1
  40. package/dist/types/ai-model/llm-planning.d.ts +2 -1
  41. package/dist/types/ai-model/prompt/llm-planning.d.ts +0 -6
  42. package/dist/types/ai-model/service-caller/index.d.ts +1 -1
  43. package/dist/types/ai-model/ui-tars-planning.d.ts +6 -18
  44. package/dist/types/index.d.ts +1 -1
  45. package/dist/types/types.d.ts +0 -1
  46. package/dist/types/yaml.d.ts +1 -11
  47. package/package.json +3 -3
@@ -1,4 +1,3 @@
1
- import { type ChatCompletionMessageParam } from '../ai-model';
2
1
  import type { AbstractInterface } from '../device';
3
2
  import { type DetailedLocateParam, type ExecutionTaskApply, type ExecutionTaskProgressOptions, Executor, type Insight, type InsightExtractOption, type InsightExtractParam, type MidsceneYamlFlowItem, type PlanningAction, type PlanningActionParamWaitFor, type PlanningLocateParam, type TMultimodalPrompt, type TUserPrompt } from '../index';
4
3
  import { type IModelConfig } from '@midscene/shared/env';
@@ -13,7 +12,7 @@ export declare class TaskExecutor {
13
12
  interface: AbstractInterface;
14
13
  insight: Insight;
15
14
  taskCache?: TaskCache;
16
- conversationHistory: ChatCompletionMessageParam[];
15
+ private conversationHistory;
17
16
  onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];
18
17
  replanningCycleLimit?: number;
19
18
  get page(): AbstractInterface;
@@ -32,29 +31,17 @@ export declare class TaskExecutor {
32
31
  loadYamlFlowAsPlanning(userInstruction: string, yamlString: string): Promise<{
33
32
  executor: Executor;
34
33
  }>;
35
- private planningTaskFromPrompt;
36
- private planningTaskToGoal;
34
+ private createPlanningTask;
37
35
  runPlans(title: string, plans: PlanningAction[], modelConfig: IModelConfig): Promise<ExecutionResult>;
36
+ private getReplanningCycleLimit;
38
37
  action(userPrompt: string, modelConfig: IModelConfig, actionContext?: string): Promise<ExecutionResult<{
39
38
  yamlFlow?: MidsceneYamlFlowItem[];
40
39
  } | undefined>>;
41
- actionToGoal(userPrompt: string, modelConfig: IModelConfig): Promise<ExecutionResult<{
42
- yamlFlow?: MidsceneYamlFlowItem[];
43
- } | undefined>>;
44
40
  private createTypeQueryTask;
45
41
  createTypeQueryExecution<T>(type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert', demand: InsightExtractParam, modelConfig: IModelConfig, opt?: InsightExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<ExecutionResult<T>>;
46
42
  assert(assertion: TUserPrompt, modelConfig: IModelConfig, opt?: InsightExtractOption): Promise<ExecutionResult<boolean>>;
47
- /**
48
- * Append a message to the conversation history
49
- * For user messages with images:
50
- * - Keep max 4 user image messages in history
51
- * - Remove oldest user image message when limit reached
52
- * For assistant messages:
53
- * - Simply append to history
54
- * @param conversationHistory Message to append
55
- */
56
- private appendConversationHistory;
57
43
  private appendErrorPlan;
44
+ taskForSleep(timeMs: number, modelConfig: IModelConfig): Promise<ExecutionTaskApply<any, any, any, any>>;
58
45
  waitFor(assertion: TUserPrompt, opt: PlanningActionParamWaitFor, modelConfig: IModelConfig): Promise<ExecutionResult<void>>;
59
46
  }
60
47
  export {};
@@ -0,0 +1,18 @@
1
+ import type { ChatCompletionMessageParam } from 'openai/resources/index';
2
+ export interface ConversationHistoryOptions {
3
+ maxUserImageMessages?: number;
4
+ initialMessages?: ChatCompletionMessageParam[];
5
+ }
6
+ export declare class ConversationHistory {
7
+ private readonly maxUserImageMessages;
8
+ private readonly messages;
9
+ constructor(options?: ConversationHistoryOptions);
10
+ append(message: ChatCompletionMessageParam): void;
11
+ seed(messages: ChatCompletionMessageParam[]): void;
12
+ reset(): void;
13
+ snapshot(): ChatCompletionMessageParam[];
14
+ get length(): number;
15
+ [Symbol.iterator](): IterableIterator<ChatCompletionMessageParam>;
16
+ toJSON(): ChatCompletionMessageParam[];
17
+ private pruneOldestUserMessageIfNecessary;
18
+ }
@@ -7,6 +7,7 @@ export type { ChatCompletionMessageParam } from 'openai/resources/index';
7
7
  export { AiLocateElement, AiExtractElementInfo, AiLocateSection, } from './inspect';
8
8
  export { plan } from './llm-planning';
9
9
  export { adaptBboxToRect } from './common';
10
- export { vlmPlanning, resizeImageForUiTars } from './ui-tars-planning';
10
+ export { uiTarsPlanning, resizeImageForUiTars } from './ui-tars-planning';
11
+ export { ConversationHistory, type ConversationHistoryOptions, } from './conversation-history';
11
12
  export { AIActionType, type AIArgs } from './common';
12
13
  export { getMidsceneLocationSchema, type MidsceneLocationResultType, PointSchema, SizeSchema, RectSchema, TMultimodalPromptSchema, TUserPromptSchema, type TMultimodalPrompt, type TUserPrompt, findAllMidsceneLocatorField, dumpActionParam, loadActionParam, } from './common';
@@ -1,10 +1,11 @@
1
1
  import type { DeviceAction, InterfaceType, PlanningAIResponse, UIContext } from '../types';
2
2
  import type { IModelConfig } from '@midscene/shared/env';
3
+ import type { ConversationHistory } from './conversation-history';
3
4
  export declare function plan(userInstruction: string, opts: {
4
5
  context: UIContext;
5
6
  interfaceType: InterfaceType;
6
7
  actionSpace: DeviceAction<any>[];
7
- log?: string;
8
8
  actionContext?: string;
9
9
  modelConfig: IModelConfig;
10
+ conversationHistory?: ConversationHistory;
10
11
  }): Promise<PlanningAIResponse>;
@@ -1,5 +1,4 @@
1
1
  import type { DeviceAction } from '../../types';
2
- import { PromptTemplate } from '@langchain/core/prompts';
3
2
  import type { TVlModeTypes } from '@midscene/shared/env';
4
3
  import type { ResponseFormatJSONSchema } from 'openai/resources/index';
5
4
  export declare const descriptionForAction: (action: DeviceAction<any>, locatorSchemaTypeDescription: string) => string;
@@ -8,8 +7,3 @@ export declare function systemPromptToTaskPlanning({ actionSpace, vlMode, }: {
8
7
  vlMode: TVlModeTypes | undefined;
9
8
  }): Promise<string>;
10
9
  export declare const planSchema: ResponseFormatJSONSchema;
11
- export declare const generateTaskBackgroundContext: (userInstruction: string, log?: string, userActionContext?: string) => string;
12
- export declare const automationUserPrompt: (vlMode: TVlModeTypes | undefined) => PromptTemplate<{
13
- pageDescription: any;
14
- taskBackgroundContext: any;
15
- }, any>;
@@ -13,7 +13,7 @@ export declare function callAI(messages: ChatCompletionMessageParam[], AIActionT
13
13
  isStreamed: boolean;
14
14
  }>;
15
15
  export declare const getResponseFormat: (modelName: string, AIActionTypeValue: AIActionType) => OpenAI.ChatCompletionCreateParams["response_format"] | OpenAI.ResponseFormatJSONObject;
16
- export declare function callAIWithObjectResponse<T>(messages: AIArgs, AIActionTypeValue: AIActionType, modelConfig: IModelConfig): Promise<{
16
+ export declare function callAIWithObjectResponse<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelConfig: IModelConfig): Promise<{
17
17
  content: T;
18
18
  usage?: AIUsageInfo;
19
19
  }>;
@@ -1,24 +1,12 @@
1
- import type { AIUsageInfo, MidsceneYamlFlowItem, PlanningAction, Size } from '../types';
1
+ import type { PlanningAIResponse, Size, UIContext } from '../types';
2
2
  import { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';
3
- import { actionParser } from '@ui-tars/action-parser';
4
- import type { ChatCompletionMessageParam } from 'openai/resources/index';
3
+ import type { ConversationHistory } from './conversation-history';
5
4
  type ActionType = 'click' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
6
- export declare function vlmPlanning(options: {
7
- userInstruction: string;
8
- conversationHistory: ChatCompletionMessageParam[];
9
- size: {
10
- width: number;
11
- height: number;
12
- };
5
+ export declare function uiTarsPlanning(userInstruction: string, options: {
6
+ conversationHistory: ConversationHistory;
7
+ context: UIContext;
13
8
  modelConfig: IModelConfig;
14
- }): Promise<{
15
- actions: PlanningAction<any>[];
16
- actionsFromModel: ReturnType<typeof actionParser>['parsed'];
17
- action_summary: string;
18
- yamlFlow?: MidsceneYamlFlowItem[];
19
- usage?: AIUsageInfo;
20
- rawResponse?: string;
21
- }>;
9
+ }): Promise<PlanningAIResponse>;
22
10
  interface BaseAction {
23
11
  action_type: ActionType;
24
12
  action_inputs: Record<string, any>;
@@ -8,5 +8,5 @@ export type * from './types';
8
8
  export { z };
9
9
  export default Insight;
10
10
  export { Executor, Insight, getVersion };
11
- export type { MidsceneYamlScript, MidsceneYamlTask, MidsceneYamlFlowItem, MidsceneYamlConfigResult, MidsceneYamlConfig, MidsceneYamlScriptWebEnv, MidsceneYamlScriptAndroidEnv, MidsceneYamlScriptIOSEnv, MidsceneYamlScriptEnv, LocateOption, DetailedLocateParam, } from './yaml';
11
+ export type { MidsceneYamlScript, MidsceneYamlTask, MidsceneYamlFlowItem, MidsceneYamlConfigResult, LocateOption, DetailedLocateParam, } from './yaml';
12
12
  export { Agent, type AgentOpt, createAgent } from './agent';
@@ -318,7 +318,6 @@ export type ExecutionTaskLogApply<LogParam = {
318
318
  export type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;
319
319
  export type ExecutionTaskPlanningApply = ExecutionTaskApply<'Planning', {
320
320
  userInstruction: string;
321
- log?: string;
322
321
  }, PlanningAIResponse>;
323
322
  export type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;
324
323
  export interface GroupedActionDump {
@@ -32,7 +32,6 @@ export interface MidsceneYamlScript {
32
32
  target?: MidsceneYamlScriptWebEnv;
33
33
  web?: MidsceneYamlScriptWebEnv;
34
34
  android?: MidsceneYamlScriptAndroidEnv;
35
- ios?: MidsceneYamlScriptIOSEnv;
36
35
  interface?: MidsceneYamlScriptEnvGeneralInterface;
37
36
  config?: MidsceneYamlScriptConfig;
38
37
  agent?: MidsceneYamlScriptAgentOpt;
@@ -74,15 +73,7 @@ export interface MidsceneYamlScriptAndroidEnv extends MidsceneYamlScriptConfig {
74
73
  deviceId?: string;
75
74
  launch?: string;
76
75
  }
77
- export interface MidsceneYamlScriptIOSEnv extends MidsceneYamlScriptConfig {
78
- deviceId?: string;
79
- wdaPort?: number;
80
- wdaHost?: string;
81
- autoDismissKeyboard?: boolean;
82
- keyboardDismissStrategy?: 'done-first' | 'escape-first';
83
- launch?: string;
84
- }
85
- export type MidsceneYamlScriptEnv = MidsceneYamlScriptWebEnv | MidsceneYamlScriptAndroidEnv | MidsceneYamlScriptIOSEnv;
76
+ export type MidsceneYamlScriptEnv = MidsceneYamlScriptWebEnv | MidsceneYamlScriptAndroidEnv;
86
77
  export interface MidsceneYamlFlowItemAIAction {
87
78
  ai?: string;
88
79
  aiAction?: string;
@@ -152,7 +143,6 @@ export interface MidsceneYamlConfig {
152
143
  shareBrowserContext?: boolean;
153
144
  web?: MidsceneYamlScriptWebEnv;
154
145
  android?: MidsceneYamlScriptAndroidEnv;
155
- ios?: MidsceneYamlScriptIOSEnv;
156
146
  files: string[];
157
147
  headed?: boolean;
158
148
  keepWindow?: boolean;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "0.28.12-beta-20250923124135.0",
4
+ "version": "0.28.12-beta-20250924031347.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "main": "./dist/lib/index.js",
@@ -87,8 +87,8 @@
87
87
  "zod": "3.24.3",
88
88
  "semver": "7.5.2",
89
89
  "js-yaml": "4.1.0",
90
- "@midscene/recorder": "0.28.12-beta-20250923124135.0",
91
- "@midscene/shared": "0.28.12-beta-20250923124135.0"
90
+ "@midscene/recorder": "0.28.12-beta-20250924031347.0",
91
+ "@midscene/shared": "0.28.12-beta-20250924031347.0"
92
92
  },
93
93
  "devDependencies": {
94
94
  "@microsoft/api-extractor": "^7.52.10",