@midscene/core 1.0.5-beta-20251230131740.0 → 1.0.5-beta-20251231022759.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/es/agent/agent.mjs +3 -1
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/task-builder.mjs +1 -0
  4. package/dist/es/agent/task-builder.mjs.map +1 -1
  5. package/dist/es/agent/tasks.mjs +8 -4
  6. package/dist/es/agent/tasks.mjs.map +1 -1
  7. package/dist/es/agent/utils.mjs +1 -1
  8. package/dist/es/ai-model/inspect.mjs +4 -2
  9. package/dist/es/ai-model/inspect.mjs.map +1 -1
  10. package/dist/es/ai-model/llm-planning.mjs +5 -1
  11. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  12. package/dist/es/ai-model/prompt/llm-planning.mjs +3 -11
  13. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  14. package/dist/es/ai-model/service-caller/index.mjs +21 -5
  15. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  16. package/dist/es/service/index.mjs +7 -4
  17. package/dist/es/service/index.mjs.map +1 -1
  18. package/dist/es/types.mjs.map +1 -1
  19. package/dist/es/utils.mjs +2 -2
  20. package/dist/lib/agent/agent.js +3 -1
  21. package/dist/lib/agent/agent.js.map +1 -1
  22. package/dist/lib/agent/task-builder.js +1 -0
  23. package/dist/lib/agent/task-builder.js.map +1 -1
  24. package/dist/lib/agent/tasks.js +8 -4
  25. package/dist/lib/agent/tasks.js.map +1 -1
  26. package/dist/lib/agent/utils.js +1 -1
  27. package/dist/lib/ai-model/inspect.js +4 -2
  28. package/dist/lib/ai-model/inspect.js.map +1 -1
  29. package/dist/lib/ai-model/llm-planning.js +5 -1
  30. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  31. package/dist/lib/ai-model/prompt/llm-planning.js +3 -11
  32. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  33. package/dist/lib/ai-model/service-caller/index.js +21 -5
  34. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  35. package/dist/lib/service/index.js +7 -4
  36. package/dist/lib/service/index.js.map +1 -1
  37. package/dist/lib/types.js.map +1 -1
  38. package/dist/lib/utils.js +2 -2
  39. package/dist/types/agent/agent.d.ts +2 -0
  40. package/dist/types/agent/tasks.d.ts +1 -1
  41. package/dist/types/ai-model/inspect.d.ts +2 -0
  42. package/dist/types/ai-model/llm-planning.d.ts +2 -0
  43. package/dist/types/ai-model/service-caller/index.d.ts +8 -1
  44. package/dist/types/types.d.ts +4 -0
  45. package/package.json +2 -2
@@ -1 +1 @@
1
- {"version":3,"file":"agent/tasks.mjs","sources":["../../../src/agent/tasks.ts"],"sourcesContent":["import { ConversationHistory, plan, uiTarsPlanning } from '@/ai-model';\nimport type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport type Service from '@/service';\nimport type { TaskRunner } from '@/task-runner';\nimport { TaskExecutionError } from '@/task-runner';\nimport type {\n DeviceAction,\n ExecutionTaskApply,\n ExecutionTaskInsightQueryApply,\n ExecutionTaskPlanningApply,\n ExecutionTaskProgressOptions,\n InterfaceType,\n MidsceneYamlFlowItem,\n PlanningAIResponse,\n PlanningAction,\n PlanningActionParamSleep,\n PlanningActionParamWaitFor,\n ServiceDump,\n ServiceExtractOption,\n ServiceExtractParam,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ExecutionSession } from './execution-session';\nimport { TaskBuilder } from './task-builder';\nimport type { TaskCache } from './task-cache';\nexport { locatePlanForLocate } from './task-builder';\nimport { descriptionOfTree } from '@midscene/shared/extractor';\nimport { taskTitleStr } from './ui-utils';\nimport { parsePrompt } from './utils';\n\ninterface ExecutionResult<OutputType = any> {\n output: OutputType;\n thought?: string;\n runner: TaskRunner;\n}\n\ninterface TaskExecutorHooks {\n onTaskUpdate?: (\n runner: TaskRunner,\n error?: TaskExecutionError,\n ) => Promise<void> | void;\n}\n\nconst debug = getDebug('device-task-executor');\nconst maxErrorCountAllowedInOnePlanningLoop = 5;\n\nexport { TaskExecutionError };\n\nexport class TaskExecutor {\n interface: AbstractInterface;\n\n service: Service;\n\n taskCache?: TaskCache;\n\n private readonly providedActionSpace: DeviceAction[];\n\n private readonly taskBuilder: TaskBuilder;\n\n private conversationHistory: ConversationHistory;\n\n onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];\n\n private readonly hooks?: TaskExecutorHooks;\n\n replanningCycleLimit?: number;\n\n // @deprecated use .interface instead\n get page() {\n return this.interface;\n }\n\n constructor(\n interfaceInstance: AbstractInterface,\n service: Service,\n opts: {\n taskCache?: TaskCache;\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n replanningCycleLimit?: number;\n hooks?: TaskExecutorHooks;\n actionSpace: DeviceAction[];\n },\n ) {\n this.interface = interfaceInstance;\n this.service = service;\n this.taskCache = opts.taskCache;\n this.onTaskStartCallback = opts?.onTaskStart;\n this.replanningCycleLimit = opts.replanningCycleLimit;\n this.hooks = opts.hooks;\n this.conversationHistory = new ConversationHistory();\n this.providedActionSpace = opts.actionSpace;\n this.taskBuilder = new TaskBuilder({\n interfaceInstance,\n service,\n taskCache: opts.taskCache,\n actionSpace: this.getActionSpace(),\n });\n }\n\n private createExecutionSession(\n title: string,\n options?: { tasks?: ExecutionTaskApply[] },\n ) {\n return new ExecutionSession(\n title,\n () => Promise.resolve(this.service.contextRetrieverFn()),\n {\n onTaskStart: this.onTaskStartCallback,\n tasks: options?.tasks,\n onTaskUpdate: this.hooks?.onTaskUpdate,\n },\n );\n }\n\n private getActionSpace(): DeviceAction[] {\n return this.providedActionSpace;\n }\n\n public async convertPlanToExecutable(\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n options?: {\n cacheable?: boolean;\n subTask?: boolean;\n },\n ) {\n return this.taskBuilder.build(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n options,\n );\n }\n\n async loadYamlFlowAsPlanning(userInstruction: string, yamlString: string) {\n const session = this.createExecutionSession(\n taskTitleStr('Action', userInstruction),\n );\n\n const task: ExecutionTaskPlanningApply = {\n type: 'Planning',\n subType: 'LoadYaml',\n param: {\n userInstruction,\n },\n executor: async (param, executorContext) => {\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n return {\n output: {\n actions: [],\n more_actions_needed_by_instruction: false,\n log: '',\n yamlString,\n },\n cache: {\n hit: true,\n },\n hitBy: {\n from: 'Cache',\n context: {\n yamlString,\n },\n },\n };\n },\n };\n const runner = session.getRunner();\n await session.appendAndRun(task);\n\n return {\n runner,\n };\n }\n\n async runPlans(\n title: string,\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n ): Promise<ExecutionResult> {\n const session = this.createExecutionSession(title);\n const { tasks } = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n );\n const runner = session.getRunner();\n const result = await session.appendAndRun(tasks);\n const { output } = result ?? {};\n return {\n output,\n runner,\n };\n }\n\n async action(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n }\n | undefined\n >\n > {\n this.conversationHistory.reset();\n\n const session = this.createExecutionSession(\n taskTitleStr('Action', userPrompt),\n );\n const runner = session.getRunner();\n\n let replanCount = 0;\n const yamlFlow: MidsceneYamlFlowItem[] = [];\n const replanningCycleLimit =\n replanningCycleLimitOverride ?? this.replanningCycleLimit;\n assert(\n replanningCycleLimit !== undefined,\n 'replanningCycleLimit is required for TaskExecutor.action',\n );\n\n let errorCountInOnePlanningLoop = 0; // count the number of errors in one planning loop\n\n // Main planning loop - unified plan/replan logic\n while (true) {\n const result = await session.appendAndRun(\n {\n type: 'Planning',\n subType: 'Plan',\n param: {\n userInstruction: userPrompt,\n aiActContext,\n imagesIncludeCount,\n },\n executor: async (param, executorContext) => {\n const startTime = Date.now();\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n const { vlMode } = modelConfigForPlanning;\n const uiTarsModelVersion =\n vlMode === 'vlm-ui-tars'\n ? modelConfigForPlanning.uiTarsModelVersion\n : undefined;\n\n const actionSpace = this.getActionSpace();\n debug(\n 'actionSpace for this interface is:',\n actionSpace.map((action) => action.name).join(', '),\n );\n assert(Array.isArray(actionSpace), 'actionSpace must be an array');\n if (actionSpace.length === 0) {\n console.warn(\n `ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`,\n );\n }\n\n const planResult = await (uiTarsModelVersion\n ? uiTarsPlanning\n : plan)(param.userInstruction, {\n context: uiContext,\n actionContext: param.aiActContext,\n interfaceType: this.interface.interfaceType as InterfaceType,\n actionSpace,\n modelConfig: modelConfigForPlanning,\n conversationHistory: this.conversationHistory,\n includeBbox: includeBboxInPlanning,\n imagesIncludeCount,\n });\n debug('planResult', JSON.stringify(planResult, null, 2));\n\n const {\n actions,\n log,\n more_actions_needed_by_instruction,\n error,\n usage,\n rawResponse,\n sleep,\n } = planResult;\n\n executorContext.task.log = {\n ...(executorContext.task.log || {}),\n rawResponse,\n };\n executorContext.task.usage = usage;\n executorContext.task.output = {\n actions: actions || [],\n more_actions_needed_by_instruction,\n log,\n yamlFlow: planResult.yamlFlow,\n };\n executorContext.uiContext = uiContext;\n\n const finalActions = [...(actions || [])];\n\n if (sleep) {\n const timeNow = Date.now();\n const timeRemaining = sleep - (timeNow - startTime);\n if (timeRemaining > 0) {\n finalActions.push(this.sleepPlan(timeRemaining));\n }\n }\n\n assert(!error, `Failed to continue: ${error}\\n${log || ''}`);\n\n return {\n cache: {\n hit: false,\n },\n } as any;\n },\n },\n {\n allowWhenError: true,\n },\n );\n\n const planResult = result?.output as PlanningAIResponse | undefined;\n\n // Execute planned actions\n const plans = planResult?.actions || [];\n yamlFlow.push(...(planResult?.yamlFlow || []));\n\n let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;\n try {\n executables = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n {\n cacheable,\n subTask: true,\n },\n );\n } catch (error) {\n return session.appendErrorPlan(\n `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(\n plans,\n )}`,\n );\n }\n if (this.conversationHistory.pendingFeedbackMessage) {\n console.warn(\n 'unconsumed pending feedback message detected, this may lead to unexpected planning result:',\n this.conversationHistory.pendingFeedbackMessage,\n );\n }\n let errorFlag = false;\n try {\n await session.appendAndRun(executables.tasks);\n } catch (error: any) {\n errorFlag = true;\n errorCountInOnePlanningLoop++;\n this.conversationHistory.pendingFeedbackMessage = `Error executing running tasks: ${error?.message || String(error)}`;\n debug(\n 'error when executing running tasks, but continue to run if it is not too many errors:',\n error instanceof Error ? error.message : String(error),\n 'current error count in one planning loop:',\n errorCountInOnePlanningLoop,\n );\n }\n\n if (errorCountInOnePlanningLoop > maxErrorCountAllowedInOnePlanningLoop) {\n return session.appendErrorPlan('Too many errors in one planning loop');\n }\n\n // Check if task is complete\n if (!planResult?.more_actions_needed_by_instruction) {\n if (errorFlag) {\n debug(\n 'more_actions_needed_by_instruction is false, but there are errors in one planning loop, continue to run',\n );\n } else {\n break;\n }\n }\n\n // Increment replan count for next iteration\n ++replanCount;\n\n if (replanCount > replanningCycleLimit) {\n const errorMsg = `Replanned ${replanningCycleLimit} times, exceeding the limit. Please configure a larger value for replanningCycleLimit (or use MIDSCENE_REPLANNING_CYCLE_LIMIT) to handle more complex tasks.`;\n return session.appendErrorPlan(errorMsg);\n }\n\n if (!this.conversationHistory.pendingFeedbackMessage) {\n this.conversationHistory.pendingFeedbackMessage =\n 'I have finished the action previously planned.';\n }\n }\n\n const finalResult = {\n output: {\n yamlFlow,\n },\n runner,\n };\n return finalResult;\n }\n\n private createTypeQueryTask(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert' | 'WaitFor',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ) {\n const queryTask: ExecutionTaskInsightQueryApply = {\n type: 'Insight',\n subType: type,\n param: {\n dataDemand: multimodalPrompt\n ? ({\n demand,\n multimodalPrompt,\n } as never)\n : demand, // for user param presentation in report right sidebar\n },\n executor: async (param, taskContext) => {\n const { task } = taskContext;\n let queryDump: ServiceDump | undefined;\n const applyDump = (dump: ServiceDump) => {\n queryDump = dump;\n task.log = {\n dump,\n };\n };\n\n // Get context for query operations\n const uiContext = taskContext.uiContext;\n assert(uiContext, 'uiContext is required for Query task');\n\n const ifTypeRestricted = type !== 'Query';\n let demandInput = demand;\n let keyOfResult = 'result';\n if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {\n keyOfResult = 'StatementIsTruthy';\n const booleanPrompt =\n type === 'Assert'\n ? `Boolean, whether the following statement is true: ${demand}`\n : `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;\n demandInput = {\n [keyOfResult]: booleanPrompt,\n };\n } else if (ifTypeRestricted) {\n demandInput = {\n [keyOfResult]: `${type}, ${demand}`,\n };\n }\n\n let extractResult;\n\n let extraPageDescription = '';\n if (opt?.domIncluded && this.interface.getElementsNodeTree) {\n debug('appending tree info for page');\n const tree = await this.interface.getElementsNodeTree();\n extraPageDescription = await descriptionOfTree(\n tree,\n 200,\n false,\n opt?.domIncluded === 'visible-only',\n );\n }\n\n try {\n extractResult = await this.service.extract<any>(\n demandInput,\n modelConfig,\n opt,\n extraPageDescription,\n multimodalPrompt,\n );\n } catch (error) {\n if (error instanceof ServiceError) {\n applyDump(error.dump);\n }\n throw error;\n }\n\n const { data, usage, thought, dump } = extractResult;\n applyDump(dump);\n\n let outputResult = data;\n if (ifTypeRestricted) {\n // If AI returned a plain string instead of structured format, use it directly\n if (typeof data === 'string') {\n outputResult = data;\n } else if (type === 'WaitFor') {\n if (data === null || data === undefined) {\n outputResult = false;\n } else {\n outputResult = (data as any)[keyOfResult];\n }\n } else if (data === null || data === undefined) {\n outputResult = null;\n } else {\n assert(\n data?.[keyOfResult] !== undefined,\n 'No result in query data',\n );\n outputResult = (data as any)[keyOfResult];\n }\n }\n\n if (type === 'Assert' && !outputResult) {\n task.usage = usage;\n task.thought = thought;\n throw new Error(`Assertion failed: ${thought}`);\n }\n\n return {\n output: outputResult,\n log: queryDump,\n usage,\n thought,\n };\n },\n };\n\n return queryTask;\n }\n async createTypeQueryExecution<T>(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ExecutionResult<T>> {\n const session = this.createExecutionSession(\n taskTitleStr(\n type,\n typeof demand === 'string' ? demand : JSON.stringify(demand),\n ),\n );\n\n const queryTask = await this.createTypeQueryTask(\n type,\n demand,\n modelConfig,\n opt,\n multimodalPrompt,\n );\n\n const runner = session.getRunner();\n const result = await session.appendAndRun(queryTask);\n\n if (!result) {\n throw new Error(\n 'result of taskExecutor.flush() is undefined in function createTypeQueryTask',\n );\n }\n\n const { output, thought } = result;\n\n return {\n output,\n thought,\n runner,\n };\n }\n\n private sleepPlan(timeMs: number): PlanningAction<PlanningActionParamSleep> {\n return {\n type: 'Sleep',\n param: {\n timeMs,\n },\n };\n }\n\n async taskForSleep(timeMs: number, _modelConfig: IModelConfig) {\n return this.taskBuilder.createSleepTask({\n timeMs,\n });\n }\n\n async waitFor(\n assertion: TUserPrompt,\n opt: PlanningActionParamWaitFor,\n modelConfig: IModelConfig,\n ): Promise<ExecutionResult<void>> {\n const { textPrompt, multimodalPrompt } = parsePrompt(assertion);\n\n const description = `waitFor: ${textPrompt}`;\n const session = this.createExecutionSession(\n taskTitleStr('WaitFor', description),\n );\n const runner = session.getRunner();\n const { timeoutMs, checkIntervalMs } = opt;\n\n assert(assertion, 'No assertion for waitFor');\n assert(timeoutMs, 'No timeoutMs for waitFor');\n assert(checkIntervalMs, 'No checkIntervalMs for waitFor');\n\n assert(\n checkIntervalMs <= timeoutMs,\n `wrong config for waitFor: checkIntervalMs must be less than timeoutMs, config: {checkIntervalMs: ${checkIntervalMs}, timeoutMs: ${timeoutMs}}`,\n );\n\n const overallStartTime = Date.now();\n let lastCheckStart = overallStartTime;\n let errorThought = '';\n // Continue checking as long as the previous iteration began within the timeout window.\n while (lastCheckStart - overallStartTime <= timeoutMs) {\n const currentCheckStart = Date.now();\n lastCheckStart = currentCheckStart;\n const queryTask = await this.createTypeQueryTask(\n 'WaitFor',\n textPrompt,\n modelConfig,\n undefined,\n multimodalPrompt,\n );\n\n const result = (await session.appendAndRun(queryTask)) as\n | {\n output: boolean;\n thought?: string;\n }\n | undefined;\n\n if (result?.output) {\n return {\n output: undefined,\n runner,\n };\n }\n\n errorThought =\n result?.thought ||\n (!result && `No result from assertion: ${textPrompt}`) ||\n `unknown error when waiting for assertion: ${textPrompt}`;\n const now = Date.now();\n if (now - currentCheckStart < checkIntervalMs) {\n const timeRemaining = checkIntervalMs - (now - currentCheckStart);\n const sleepTask = this.taskBuilder.createSleepTask({\n timeMs: timeRemaining,\n });\n await session.append(sleepTask);\n }\n }\n\n return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);\n }\n}\n"],"names":["debug","getDebug","maxErrorCountAllowedInOnePlanningLoop","TaskExecutor","title","options","ExecutionSession","Promise","plans","modelConfigForPlanning","modelConfigForDefaultIntent","userInstruction","yamlString","session","taskTitleStr","task","param","executorContext","uiContext","assert","runner","tasks","result","output","userPrompt","includeBboxInPlanning","aiActContext","cacheable","replanningCycleLimitOverride","imagesIncludeCount","replanCount","yamlFlow","replanningCycleLimit","undefined","errorCountInOnePlanningLoop","startTime","Date","vlMode","uiTarsModelVersion","actionSpace","action","Array","console","planResult","uiTarsPlanning","plan","JSON","actions","log","more_actions_needed_by_instruction","error","usage","rawResponse","sleep","finalActions","timeNow","timeRemaining","executables","errorFlag","String","Error","errorMsg","finalResult","type","demand","modelConfig","opt","multimodalPrompt","queryTask","taskContext","queryDump","applyDump","dump","ifTypeRestricted","demandInput","keyOfResult","booleanPrompt","extractResult","extraPageDescription","tree","descriptionOfTree","ServiceError","data","thought","outputResult","timeMs","_modelConfig","assertion","textPrompt","parsePrompt","description","timeoutMs","checkIntervalMs","overallStartTime","lastCheckStart","errorThought","currentCheckStart","now","sleepTask","interfaceInstance","service","opts","ConversationHistory","TaskBuilder"],"mappings":";;;;;;;;;;;;;;;;;;;;AA+CA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,wCAAwC;AAIvC,MAAMC;IAoBX,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,SAAS;IACvB;IA6BQ,uBACNC,KAAa,EACbC,OAA0C,EAC1C;QACA,OAAO,IAAIC,iBACTF,OACA,IAAMG,QAAQ,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,kBAAkB,KACrD;YACE,aAAa,IAAI,CAAC,mBAAmB;YACrC,OAAOF,SAAS;YAChB,cAAc,IAAI,CAAC,KAAK,EAAE;QAC5B;IAEJ;IAEQ,iBAAiC;QACvC,OAAO,IAAI,CAAC,mBAAmB;IACjC;IAEA,MAAa,wBACXG,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACzCL,OAGC,EACD;QACA,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,CAC3BG,OACAC,wBACAC,6BACAL;IAEJ;IAEA,MAAM,uBAAuBM,eAAuB,EAAEC,UAAkB,EAAE;QACxE,MAAMC,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUH;QAGzB,MAAMI,OAAmC;YACvC,MAAM;YACN,SAAS;YACT,OAAO;gBACLJ;YACF;YACA,UAAU,OAAOK,OAAOC;gBACtB,MAAM,EAAEC,SAAS,EAAE,GAAGD;gBACtBE,OAAOD,WAAW;gBAClB,OAAO;oBACL,QAAQ;wBACN,SAAS,EAAE;wBACX,oCAAoC;wBACpC,KAAK;wBACLN;oBACF;oBACA,OAAO;wBACL,KAAK;oBACP;oBACA,OAAO;wBACL,MAAM;wBACN,SAAS;4BACPA;wBACF;oBACF;gBACF;YACF;QACF;QACA,MAAMQ,SAASP,QAAQ,SAAS;QAChC,MAAMA,QAAQ,YAAY,CAACE;QAE3B,OAAO;YACLK;QACF;IACF;IAEA,MAAM,SACJhB,KAAa,EACbI,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACf;QAC1B,MAAMG,UAAU,IAAI,CAAC,sBAAsB,CAACT;QAC5C,MAAM,EAAEiB,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAClDb,OACAC,wBACAC;QAEF,MAAMU,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACQ;QAC1C,MAAM,EAAEE,MAAM,EAAE,GAAGD,UAAU,CAAC;QAC9B,OAAO;YACLC;YACAH;QACF;IACF;IAEA,MAAM,OACJI,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAQ3B;QACA,IAAI,CAAC,mBAAmB,CAAC,KAAK;QAE9B,MAAMhB,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUU;QAEzB,MAAMJ,SAASP,QAAQ,SAAS;QAEhC,IAAIiB,cAAc;QAClB,MAAMC,WAAmC,EAAE;QAC3C,MAAMC,uBACJJ,gCAAgC,IAAI,CAAC,oBAAoB;QAC3DT,OACEa,AAAyBC,WAAzBD,sBACA;QAGF,IAAIE,8BAA8B;QAGlC,MAAO,KAAM;YACX,MAAMZ,SAAS,MAAMT,QAAQ,YAAY,CACvC;gBACE,MAAM;gBACN,SAAS;gBACT,OAAO;oBACL,iBAAiBW;oBACjBE;oBACAG;gBACF;gBACA,UAAU,OAAOb,OAAOC;oBACtB,MAAMkB,YAAYC,KAAK,GAAG;oBAC1B,MAAM,EAAElB,SAAS,EAAE,GAAGD;oBACtBE,OAAOD,WAAW;oBAClB,MAAM,EAAEmB,MAAM,EAAE,GAAG5B;oBACnB,MAAM6B,qBACJD,AAAW,kBAAXA,SACI5B,uBAAuB,kBAAkB,GACzCwB;oBAEN,MAAMM,cAAc,IAAI,CAAC,cAAc;oBACvCvC,MACE,sCACAuC,YAAY,GAAG,CAAC,CAACC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;oBAEhDrB,OAAOsB,MAAM,OAAO,CAACF,cAAc;oBACnC,IAAIA,AAAuB,MAAvBA,YAAY,MAAM,EACpBG,QAAQ,IAAI,CACV,CAAC,gBAAgB,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,gDAAgD,CAAC;oBAIrG,MAAMC,aAAa,MAAOL,AAAAA,CAAAA,qBACtBM,iBACAC,IAAG,EAAG7B,MAAM,eAAe,EAAE;wBAC/B,SAASE;wBACT,eAAeF,MAAM,YAAY;wBACjC,eAAe,IAAI,CAAC,SAAS,CAAC,aAAa;wBAC3CuB;wBACA,aAAa9B;wBACb,qBAAqB,IAAI,CAAC,mBAAmB;wBAC7C,aAAagB;wBACbI;oBACF;oBACA7B,MAAM,cAAc8C,KAAK,SAAS,CAACH,YAAY,MAAM;oBAErD,MAAM,EACJI,OAAO,EACPC,GAAG,EACHC,kCAAkC,EAClCC,KAAK,EACLC,KAAK,EACLC,WAAW,EACXC,KAAK,EACN,GAAGV;oBAEJ1B,gBAAgB,IAAI,CAAC,GAAG,GAAG;wBACzB,GAAIA,gBAAgB,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;wBAClCmC;oBACF;oBACAnC,gBAAgB,IAAI,CAAC,KAAK,GAAGkC;oBAC7BlC,gBAAgB,IAAI,CAAC,MAAM,GAAG;wBAC5B,SAAS8B,WAAW,EAAE;wBACtBE;wBACAD;wBACA,UAAUL,WAAW,QAAQ;oBAC/B;oBACA1B,gBAAgB,SAAS,GAAGC;oBAE5B,MAAMoC,eAAe;2BAAKP,WAAW,EAAE;qBAAE;oBAEzC,IAAIM,OAAO;wBACT,MAAME,UAAUnB,KAAK,GAAG;wBACxB,MAAMoB,gBAAgBH,QAASE,CAAAA,UAAUpB,SAAQ;wBACjD,IAAIqB,gBAAgB,GAClBF,aAAa,IAAI,CAAC,IAAI,CAAC,SAAS,CAACE;oBAErC;oBAEArC,OAAO,CAAC+B,OAAO,CAAC,oBAAoB,EAAEA,MAAM,EAAE,EAAEF,OAAO,IAAI;oBAE3D,OAAO;wBACL,OAAO;4BACL,KAAK;wBACP;oBACF;gBACF;YACF,GACA;gBACE,gBAAgB;YAClB;YAGF,MAAML,aAAarB,QAAQ;YAG3B,MAAMd,QAAQmC,YAAY,WAAW,EAAE;YACvCZ,SAAS,IAAI,IAAKY,YAAY,YAAY,EAAE;YAE5C,IAAIc;YACJ,IAAI;gBACFA,cAAc,MAAM,IAAI,CAAC,uBAAuB,CAC9CjD,OACAC,wBACAC,6BACA;oBACEiB;oBACA,SAAS;gBACX;YAEJ,EAAE,OAAOuB,OAAO;gBACd,OAAOrC,QAAQ,eAAe,CAC5B,CAAC,4CAA4C,EAAEqC,MAAM,SAAS,EAAEJ,KAAK,SAAS,CAC5EtC,QACC;YAEP;YACA,IAAI,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EACjDkC,QAAQ,IAAI,CACV,8FACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB;YAGnD,IAAIgB,YAAY;YAChB,IAAI;gBACF,MAAM7C,QAAQ,YAAY,CAAC4C,YAAY,KAAK;YAC9C,EAAE,OAAOP,OAAY;gBACnBQ,YAAY;gBACZxB;gBACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAAG,CAAC,+BAA+B,EAAEgB,OAAO,WAAWS,OAAOT,QAAQ;gBACrHlD,MACE,yFACAkD,iBAAiBU,QAAQV,MAAM,OAAO,GAAGS,OAAOT,QAChD,6CACAhB;YAEJ;YAEA,IAAIA,8BAA8BhC,uCAChC,OAAOW,QAAQ,eAAe,CAAC;YAIjC,IAAI,CAAC8B,YAAY,oCACf,IAAIe,WACF1D,MACE;iBAGF;YAKJ,EAAE8B;YAEF,IAAIA,cAAcE,sBAAsB;gBACtC,MAAM6B,WAAW,CAAC,UAAU,EAAE7B,qBAAqB,4JAA4J,CAAC;gBAChN,OAAOnB,QAAQ,eAAe,CAACgD;YACjC;YAEA,IAAI,CAAC,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EAClD,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAC7C;QAEN;QAEA,MAAMC,cAAc;YAClB,QAAQ;gBACN/B;YACF;YACAX;QACF;QACA,OAAO0C;IACT;IAEQ,oBACNC,IAAsE,EACtEC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACpC;QACA,MAAMC,YAA4C;YAChD,MAAM;YACN,SAASL;YACT,OAAO;gBACL,YAAYI,mBACP;oBACCH;oBACAG;gBACF,IACAH;YACN;YACA,UAAU,OAAOhD,OAAOqD;gBACtB,MAAM,EAAEtD,IAAI,EAAE,GAAGsD;gBACjB,IAAIC;gBACJ,MAAMC,YAAY,CAACC;oBACjBF,YAAYE;oBACZzD,KAAK,GAAG,GAAG;wBACTyD;oBACF;gBACF;gBAGA,MAAMtD,YAAYmD,YAAY,SAAS;gBACvClD,OAAOD,WAAW;gBAElB,MAAMuD,mBAAmBV,AAAS,YAATA;gBACzB,IAAIW,cAAcV;gBAClB,IAAIW,cAAc;gBAClB,IAAIF,oBAAqBV,CAAAA,AAAS,aAATA,QAAqBA,AAAS,cAATA,IAAiB,GAAI;oBACjEY,cAAc;oBACd,MAAMC,gBACJb,AAAS,aAATA,OACI,CAAC,kDAAkD,EAAEC,QAAQ,GAC7D,CAAC,+GAA+G,EAAEA,QAAQ;oBAChIU,cAAc;wBACZ,CAACC,YAAY,EAAEC;oBACjB;gBACF,OAAO,IAAIH,kBACTC,cAAc;oBACZ,CAACC,YAAY,EAAE,GAAGZ,KAAK,EAAE,EAAEC,QAAQ;gBACrC;gBAGF,IAAIa;gBAEJ,IAAIC,uBAAuB;gBAC3B,IAAIZ,KAAK,eAAe,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE;oBAC1DlE,MAAM;oBACN,MAAM+E,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,mBAAmB;oBACrDD,uBAAuB,MAAME,kBAC3BD,MACA,KACA,OACAb,KAAK,gBAAgB;gBAEzB;gBAEA,IAAI;oBACFW,gBAAgB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CACxCH,aACAT,aACAC,KACAY,sBACAX;gBAEJ,EAAE,OAAOjB,OAAO;oBACd,IAAIA,iBAAiB+B,cACnBV,UAAUrB,MAAM,IAAI;oBAEtB,MAAMA;gBACR;gBAEA,MAAM,EAAEgC,IAAI,EAAE/B,KAAK,EAAEgC,OAAO,EAAEX,IAAI,EAAE,GAAGK;gBACvCN,UAAUC;gBAEV,IAAIY,eAAeF;gBACnB,IAAIT,kBAEF,IAAI,AAAgB,YAAhB,OAAOS,MACTE,eAAeF;qBACV,IAAInB,AAAS,cAATA,MAEPqB,eADEF,QAAAA,OACa,QAECA,IAAY,CAACP,YAAY;qBAEtC,IAAIO,QAAAA,MACTE,eAAe;qBACV;oBACLjE,OACE+D,MAAM,CAACP,YAAY,KAAK1C,QACxB;oBAEFmD,eAAgBF,IAAY,CAACP,YAAY;gBAC3C;gBAGF,IAAIZ,AAAS,aAATA,QAAqB,CAACqB,cAAc;oBACtCrE,KAAK,KAAK,GAAGoC;oBACbpC,KAAK,OAAO,GAAGoE;oBACf,MAAM,IAAIvB,MAAM,CAAC,kBAAkB,EAAEuB,SAAS;gBAChD;gBAEA,OAAO;oBACL,QAAQC;oBACR,KAAKd;oBACLnB;oBACAgC;gBACF;YACF;QACF;QAEA,OAAOf;IACT;IACA,MAAM,yBACJL,IAA0D,EAC1DC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACP;QAC7B,MAAMtD,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aACEiD,MACA,AAAkB,YAAlB,OAAOC,SAAsBA,SAASlB,KAAK,SAAS,CAACkB;QAIzD,MAAMI,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9CL,MACAC,QACAC,aACAC,KACAC;QAGF,MAAM/C,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACuD;QAE1C,IAAI,CAAC9C,QACH,MAAM,IAAIsC,MACR;QAIJ,MAAM,EAAErC,MAAM,EAAE4D,OAAO,EAAE,GAAG7D;QAE5B,OAAO;YACLC;YACA4D;YACA/D;QACF;IACF;IAEQ,UAAUiE,MAAc,EAA4C;QAC1E,OAAO;YACL,MAAM;YACN,OAAO;gBACLA;YACF;QACF;IACF;IAEA,MAAM,aAAaA,MAAc,EAAEC,YAA0B,EAAE;QAC7D,OAAO,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;YACtCD;QACF;IACF;IAEA,MAAM,QACJE,SAAsB,EACtBrB,GAA+B,EAC/BD,WAAyB,EACO;QAChC,MAAM,EAAEuB,UAAU,EAAErB,gBAAgB,EAAE,GAAGsB,YAAYF;QAErD,MAAMG,cAAc,CAAC,SAAS,EAAEF,YAAY;QAC5C,MAAM3E,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,WAAW4E;QAE1B,MAAMtE,SAASP,QAAQ,SAAS;QAChC,MAAM,EAAE8E,SAAS,EAAEC,eAAe,EAAE,GAAG1B;QAEvC/C,OAAOoE,WAAW;QAClBpE,OAAOwE,WAAW;QAClBxE,OAAOyE,iBAAiB;QAExBzE,OACEyE,mBAAmBD,WACnB,CAAC,iGAAiG,EAAEC,gBAAgB,aAAa,EAAED,UAAU,CAAC,CAAC;QAGjJ,MAAME,mBAAmBzD,KAAK,GAAG;QACjC,IAAI0D,iBAAiBD;QACrB,IAAIE,eAAe;QAEnB,MAAOD,iBAAiBD,oBAAoBF,UAAW;YACrD,MAAMK,oBAAoB5D,KAAK,GAAG;YAClC0D,iBAAiBE;YACjB,MAAM5B,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9C,WACAoB,YACAvB,aACAhC,QACAkC;YAGF,MAAM7C,SAAU,MAAMT,QAAQ,YAAY,CAACuD;YAO3C,IAAI9C,QAAQ,QACV,OAAO;gBACL,QAAQW;gBACRb;YACF;YAGF2E,eACEzE,QAAQ,WACP,CAACA,UAAU,CAAC,0BAA0B,EAAEkE,YAAY,IACrD,CAAC,0CAA0C,EAAEA,YAAY;YAC3D,MAAMS,MAAM7D,KAAK,GAAG;YACpB,IAAI6D,MAAMD,oBAAoBJ,iBAAiB;gBAC7C,MAAMpC,gBAAgBoC,kBAAmBK,CAAAA,MAAMD,iBAAgB;gBAC/D,MAAME,YAAY,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;oBACjD,QAAQ1C;gBACV;gBACA,MAAM3C,QAAQ,MAAM,CAACqF;YACvB;QACF;QAEA,OAAOrF,QAAQ,eAAe,CAAC,CAAC,iBAAiB,EAAEkF,cAAc;IACnE;IApkBA,YACEI,iBAAoC,EACpCC,OAAgB,EAChBC,IAMC,CACD;QAjCF;QAEA;QAEA;QAEA,uBAAiB,uBAAjB;QAEA,uBAAiB,eAAjB;QAEA,uBAAQ,uBAAR;QAEA;QAEA,uBAAiB,SAAjB;QAEA;QAkBE,IAAI,CAAC,SAAS,GAAGF;QACjB,IAAI,CAAC,OAAO,GAAGC;QACf,IAAI,CAAC,SAAS,GAAGC,KAAK,SAAS;QAC/B,IAAI,CAAC,mBAAmB,GAAGA,MAAM;QACjC,IAAI,CAAC,oBAAoB,GAAGA,KAAK,oBAAoB;QACrD,IAAI,CAAC,KAAK,GAAGA,KAAK,KAAK;QACvB,IAAI,CAAC,mBAAmB,GAAG,IAAIC;QAC/B,IAAI,CAAC,mBAAmB,GAAGD,KAAK,WAAW;QAC3C,IAAI,CAAC,WAAW,GAAG,IAAIE,YAAY;YACjCJ;YACAC;YACA,WAAWC,KAAK,SAAS;YACzB,aAAa,IAAI,CAAC,cAAc;QAClC;IACF;AA4iBF"}
1
+ {"version":3,"file":"agent/tasks.mjs","sources":["../../../src/agent/tasks.ts"],"sourcesContent":["import { ConversationHistory, plan, uiTarsPlanning } from '@/ai-model';\nimport type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport type Service from '@/service';\nimport type { TaskRunner } from '@/task-runner';\nimport { TaskExecutionError } from '@/task-runner';\nimport type {\n DeviceAction,\n ExecutionTaskApply,\n ExecutionTaskInsightQueryApply,\n ExecutionTaskPlanningApply,\n ExecutionTaskProgressOptions,\n InterfaceType,\n MidsceneYamlFlowItem,\n PlanningAIResponse,\n PlanningAction,\n PlanningActionParamSleep,\n PlanningActionParamWaitFor,\n ServiceDump,\n ServiceExtractOption,\n ServiceExtractParam,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ExecutionSession } from './execution-session';\nimport { TaskBuilder } from './task-builder';\nimport type { TaskCache } from './task-cache';\nexport { locatePlanForLocate } from './task-builder';\nimport { descriptionOfTree } from '@midscene/shared/extractor';\nimport { taskTitleStr } from './ui-utils';\nimport { parsePrompt } from './utils';\n\ninterface ExecutionResult<OutputType = any> {\n output: OutputType;\n thought?: string;\n runner: TaskRunner;\n}\n\ninterface TaskExecutorHooks {\n onTaskUpdate?: (\n runner: TaskRunner,\n error?: TaskExecutionError,\n ) => Promise<void> | void;\n}\n\nconst debug = getDebug('device-task-executor');\nconst maxErrorCountAllowedInOnePlanningLoop = 5;\n\nexport { TaskExecutionError };\n\nexport class TaskExecutor {\n interface: AbstractInterface;\n\n service: Service;\n\n taskCache?: TaskCache;\n\n private readonly providedActionSpace: DeviceAction[];\n\n private readonly taskBuilder: TaskBuilder;\n\n private conversationHistory: ConversationHistory;\n\n onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];\n\n private readonly hooks?: TaskExecutorHooks;\n\n replanningCycleLimit?: number;\n\n // @deprecated use .interface instead\n get page() {\n return this.interface;\n }\n\n constructor(\n interfaceInstance: AbstractInterface,\n service: Service,\n opts: {\n taskCache?: TaskCache;\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n replanningCycleLimit?: number;\n hooks?: TaskExecutorHooks;\n actionSpace: DeviceAction[];\n },\n ) {\n this.interface = interfaceInstance;\n this.service = service;\n this.taskCache = opts.taskCache;\n this.onTaskStartCallback = opts?.onTaskStart;\n this.replanningCycleLimit = opts.replanningCycleLimit;\n this.hooks = opts.hooks;\n this.conversationHistory = new ConversationHistory();\n this.providedActionSpace = opts.actionSpace;\n this.taskBuilder = new TaskBuilder({\n interfaceInstance,\n service,\n taskCache: opts.taskCache,\n actionSpace: this.getActionSpace(),\n });\n }\n\n private createExecutionSession(\n title: string,\n options?: { tasks?: ExecutionTaskApply[] },\n ) {\n return new ExecutionSession(\n title,\n () => Promise.resolve(this.service.contextRetrieverFn()),\n {\n onTaskStart: this.onTaskStartCallback,\n tasks: options?.tasks,\n onTaskUpdate: this.hooks?.onTaskUpdate,\n },\n );\n }\n\n private getActionSpace(): DeviceAction[] {\n return this.providedActionSpace;\n }\n\n public async convertPlanToExecutable(\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n options?: {\n cacheable?: boolean;\n subTask?: boolean;\n },\n ) {\n return this.taskBuilder.build(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n options,\n );\n }\n\n async loadYamlFlowAsPlanning(userInstruction: string, yamlString: string) {\n const session = this.createExecutionSession(\n taskTitleStr('Action', userInstruction),\n );\n\n const task: ExecutionTaskPlanningApply = {\n type: 'Planning',\n subType: 'LoadYaml',\n param: {\n userInstruction,\n },\n executor: async (param, executorContext) => {\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n return {\n output: {\n actions: [],\n more_actions_needed_by_instruction: false,\n log: '',\n yamlString,\n },\n cache: {\n hit: true,\n },\n hitBy: {\n from: 'Cache',\n context: {\n yamlString,\n },\n },\n };\n },\n };\n const runner = session.getRunner();\n await session.appendAndRun(task);\n\n return {\n runner,\n };\n }\n\n async runPlans(\n title: string,\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n ): Promise<ExecutionResult> {\n const session = this.createExecutionSession(title);\n const { tasks } = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n );\n const runner = session.getRunner();\n const result = await session.appendAndRun(tasks);\n const { output } = result ?? {};\n return {\n output,\n runner,\n };\n }\n\n async action(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n qwen3_vl_enable_thinking?: boolean,\n doubao_enable_thinking?: 'enabled' | 'disabled' | 'auto',\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n }\n | undefined\n >\n > {\n this.conversationHistory.reset();\n\n const session = this.createExecutionSession(\n taskTitleStr('Action', userPrompt),\n );\n const runner = session.getRunner();\n\n let replanCount = 0;\n const yamlFlow: MidsceneYamlFlowItem[] = [];\n const replanningCycleLimit =\n replanningCycleLimitOverride ?? this.replanningCycleLimit;\n assert(\n replanningCycleLimit !== undefined,\n 'replanningCycleLimit is required for TaskExecutor.action',\n );\n\n let errorCountInOnePlanningLoop = 0; // count the number of errors in one planning loop\n\n // Main planning loop - unified plan/replan logic\n while (true) {\n const result = await session.appendAndRun(\n {\n type: 'Planning',\n subType: 'Plan',\n param: {\n userInstruction: userPrompt,\n aiActContext,\n imagesIncludeCount,\n },\n executor: async (param, executorContext) => {\n const startTime = Date.now();\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n const { vlMode } = modelConfigForPlanning;\n const uiTarsModelVersion =\n vlMode === 'vlm-ui-tars'\n ? modelConfigForPlanning.uiTarsModelVersion\n : undefined;\n\n const actionSpace = this.getActionSpace();\n debug(\n 'actionSpace for this interface is:',\n actionSpace.map((action) => action.name).join(', '),\n );\n assert(Array.isArray(actionSpace), 'actionSpace must be an array');\n if (actionSpace.length === 0) {\n console.warn(\n `ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`,\n );\n }\n\n const planResult = await (uiTarsModelVersion\n ? uiTarsPlanning\n : plan)(param.userInstruction, {\n context: uiContext,\n actionContext: param.aiActContext,\n interfaceType: this.interface.interfaceType as InterfaceType,\n actionSpace,\n modelConfig: modelConfigForPlanning,\n conversationHistory: this.conversationHistory,\n includeBbox: includeBboxInPlanning,\n imagesIncludeCount,\n qwen3_vl_enable_thinking:\n modelConfigForPlanning.vlMode === 'qwen3-vl'\n ? qwen3_vl_enable_thinking\n : undefined,\n doubao_enable_thinking:\n modelConfigForPlanning.vlMode === 'doubao-vision' ||\n modelConfigForPlanning.vlMode === 'vlm-ui-tars'\n ? doubao_enable_thinking\n : undefined,\n });\n debug('planResult', JSON.stringify(planResult, null, 2));\n\n const {\n actions,\n log,\n more_actions_needed_by_instruction,\n error,\n usage,\n rawResponse,\n sleep,\n reasoning_content,\n } = planResult;\n\n executorContext.task.log = {\n ...(executorContext.task.log || {}),\n rawResponse,\n };\n executorContext.task.usage = usage;\n executorContext.task.reasoning_content = reasoning_content;\n executorContext.task.output = {\n actions: actions || [],\n more_actions_needed_by_instruction,\n log,\n yamlFlow: planResult.yamlFlow,\n };\n executorContext.uiContext = uiContext;\n\n const finalActions = [...(actions || [])];\n\n if (sleep) {\n const timeNow = Date.now();\n const timeRemaining = sleep - (timeNow - startTime);\n if (timeRemaining > 0) {\n finalActions.push(this.sleepPlan(timeRemaining));\n }\n }\n\n assert(!error, `Failed to continue: ${error}\\n${log || ''}`);\n\n return {\n cache: {\n hit: false,\n },\n } as any;\n },\n },\n {\n allowWhenError: true,\n },\n );\n\n const planResult = result?.output as PlanningAIResponse | undefined;\n\n // Execute planned actions\n const plans = planResult?.actions || [];\n yamlFlow.push(...(planResult?.yamlFlow || []));\n\n let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;\n try {\n executables = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n {\n cacheable,\n subTask: true,\n },\n );\n } catch (error) {\n return session.appendErrorPlan(\n `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(\n plans,\n )}`,\n );\n }\n if (this.conversationHistory.pendingFeedbackMessage) {\n console.warn(\n 'unconsumed pending feedback message detected, this may lead to unexpected planning result:',\n this.conversationHistory.pendingFeedbackMessage,\n );\n }\n let errorFlag = false;\n try {\n await session.appendAndRun(executables.tasks);\n } catch (error: any) {\n errorFlag = true;\n errorCountInOnePlanningLoop++;\n this.conversationHistory.pendingFeedbackMessage = `Error executing running tasks: ${error?.message || String(error)}`;\n debug(\n 'error when executing running tasks, but continue to run if it is not too many errors:',\n error instanceof Error ? error.message : String(error),\n 'current error count in one planning loop:',\n errorCountInOnePlanningLoop,\n );\n }\n\n if (errorCountInOnePlanningLoop > maxErrorCountAllowedInOnePlanningLoop) {\n return session.appendErrorPlan('Too many errors in one planning loop');\n }\n\n // Check if task is complete\n if (!planResult?.more_actions_needed_by_instruction) {\n if (errorFlag) {\n debug(\n 'more_actions_needed_by_instruction is false, but there are errors in one planning loop, continue to run',\n );\n } else {\n break;\n }\n }\n\n // Increment replan count for next iteration\n ++replanCount;\n\n if (replanCount > replanningCycleLimit) {\n const errorMsg = `Replanned ${replanningCycleLimit} times, exceeding the limit. Please configure a larger value for replanningCycleLimit (or use MIDSCENE_REPLANNING_CYCLE_LIMIT) to handle more complex tasks.`;\n return session.appendErrorPlan(errorMsg);\n }\n\n if (!this.conversationHistory.pendingFeedbackMessage) {\n this.conversationHistory.pendingFeedbackMessage =\n 'I have finished the action previously planned.';\n }\n }\n\n const finalResult = {\n output: {\n yamlFlow,\n },\n runner,\n };\n return finalResult;\n }\n\n private createTypeQueryTask(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert' | 'WaitFor',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ) {\n const queryTask: ExecutionTaskInsightQueryApply = {\n type: 'Insight',\n subType: type,\n param: {\n dataDemand: multimodalPrompt\n ? ({\n demand,\n multimodalPrompt,\n } as never)\n : demand, // for user param presentation in report right sidebar\n },\n executor: async (param, taskContext) => {\n const { task } = taskContext;\n let queryDump: ServiceDump | undefined;\n const applyDump = (dump: ServiceDump) => {\n queryDump = dump;\n task.log = {\n dump,\n };\n };\n\n // Get context for query operations\n const uiContext = taskContext.uiContext;\n assert(uiContext, 'uiContext is required for Query task');\n\n const ifTypeRestricted = type !== 'Query';\n let demandInput = demand;\n let keyOfResult = 'result';\n if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {\n keyOfResult = 'StatementIsTruthy';\n const booleanPrompt =\n type === 'Assert'\n ? `Boolean, whether the following statement is true: ${demand}`\n : `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;\n demandInput = {\n [keyOfResult]: booleanPrompt,\n };\n } else if (ifTypeRestricted) {\n demandInput = {\n [keyOfResult]: `${type}, ${demand}`,\n };\n }\n\n let extractResult;\n\n let extraPageDescription = '';\n if (opt?.domIncluded && this.interface.getElementsNodeTree) {\n debug('appending tree info for page');\n const tree = await this.interface.getElementsNodeTree();\n extraPageDescription = await descriptionOfTree(\n tree,\n 200,\n false,\n opt?.domIncluded === 'visible-only',\n );\n }\n\n try {\n extractResult = await this.service.extract<any>(\n demandInput,\n modelConfig,\n opt,\n extraPageDescription,\n multimodalPrompt,\n );\n } catch (error) {\n if (error instanceof ServiceError) {\n applyDump(error.dump);\n }\n throw error;\n }\n\n const { data, usage, thought, dump, reasoning_content } = extractResult;\n applyDump(dump);\n task.reasoning_content = reasoning_content;\n\n let outputResult = data;\n if (ifTypeRestricted) {\n // If AI returned a plain string instead of structured format, use it directly\n if (typeof data === 'string') {\n outputResult = data;\n } else if (type === 'WaitFor') {\n if (data === null || data === undefined) {\n outputResult = false;\n } else {\n outputResult = (data as any)[keyOfResult];\n }\n } else if (data === null || data === undefined) {\n outputResult = null;\n } else {\n assert(\n data?.[keyOfResult] !== undefined,\n 'No result in query data',\n );\n outputResult = (data as any)[keyOfResult];\n }\n }\n\n if (type === 'Assert' && !outputResult) {\n task.usage = usage;\n task.thought = thought;\n throw new Error(`Assertion failed: ${thought}`);\n }\n\n return {\n output: outputResult,\n log: queryDump,\n usage,\n thought,\n };\n },\n };\n\n return queryTask;\n }\n async createTypeQueryExecution<T>(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ExecutionResult<T>> {\n const session = this.createExecutionSession(\n taskTitleStr(\n type,\n typeof demand === 'string' ? demand : JSON.stringify(demand),\n ),\n );\n\n const queryTask = await this.createTypeQueryTask(\n type,\n demand,\n modelConfig,\n opt,\n multimodalPrompt,\n );\n\n const runner = session.getRunner();\n const result = await session.appendAndRun(queryTask);\n\n if (!result) {\n throw new Error(\n 'result of taskExecutor.flush() is undefined in function createTypeQueryTask',\n );\n }\n\n const { output, thought } = result;\n\n return {\n output,\n thought,\n runner,\n };\n }\n\n private sleepPlan(timeMs: number): PlanningAction<PlanningActionParamSleep> {\n return {\n type: 'Sleep',\n param: {\n timeMs,\n },\n };\n }\n\n async taskForSleep(timeMs: number, _modelConfig: IModelConfig) {\n return this.taskBuilder.createSleepTask({\n timeMs,\n });\n }\n\n async waitFor(\n assertion: TUserPrompt,\n opt: PlanningActionParamWaitFor,\n modelConfig: IModelConfig,\n ): Promise<ExecutionResult<void>> {\n const { textPrompt, multimodalPrompt } = parsePrompt(assertion);\n\n const description = `waitFor: ${textPrompt}`;\n const session = this.createExecutionSession(\n taskTitleStr('WaitFor', description),\n );\n const runner = session.getRunner();\n const { timeoutMs, checkIntervalMs } = opt;\n\n assert(assertion, 'No assertion for waitFor');\n assert(timeoutMs, 'No timeoutMs for waitFor');\n assert(checkIntervalMs, 'No checkIntervalMs for waitFor');\n\n assert(\n checkIntervalMs <= timeoutMs,\n `wrong config for waitFor: checkIntervalMs must be less than timeoutMs, config: {checkIntervalMs: ${checkIntervalMs}, timeoutMs: ${timeoutMs}}`,\n );\n\n const overallStartTime = Date.now();\n let lastCheckStart = overallStartTime;\n let errorThought = '';\n // Continue checking as long as the previous iteration began within the timeout window.\n while (lastCheckStart - overallStartTime <= timeoutMs) {\n const currentCheckStart = Date.now();\n lastCheckStart = currentCheckStart;\n const queryTask = await this.createTypeQueryTask(\n 'WaitFor',\n textPrompt,\n modelConfig,\n undefined,\n multimodalPrompt,\n );\n\n const result = (await session.appendAndRun(queryTask)) as\n | {\n output: boolean;\n thought?: string;\n }\n | undefined;\n\n if (result?.output) {\n return {\n output: undefined,\n runner,\n };\n }\n\n errorThought =\n result?.thought ||\n (!result && `No result from assertion: ${textPrompt}`) ||\n `unknown error when waiting for assertion: ${textPrompt}`;\n const now = Date.now();\n if (now - currentCheckStart < checkIntervalMs) {\n const timeRemaining = checkIntervalMs - (now - currentCheckStart);\n const sleepTask = this.taskBuilder.createSleepTask({\n timeMs: timeRemaining,\n });\n await session.append(sleepTask);\n }\n }\n\n return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);\n }\n}\n"],"names":["debug","getDebug","maxErrorCountAllowedInOnePlanningLoop","TaskExecutor","title","options","ExecutionSession","Promise","plans","modelConfigForPlanning","modelConfigForDefaultIntent","userInstruction","yamlString","session","taskTitleStr","task","param","executorContext","uiContext","assert","runner","tasks","result","output","userPrompt","includeBboxInPlanning","aiActContext","cacheable","replanningCycleLimitOverride","imagesIncludeCount","qwen3_vl_enable_thinking","doubao_enable_thinking","replanCount","yamlFlow","replanningCycleLimit","undefined","errorCountInOnePlanningLoop","startTime","Date","vlMode","uiTarsModelVersion","actionSpace","action","Array","console","planResult","uiTarsPlanning","plan","JSON","actions","log","more_actions_needed_by_instruction","error","usage","rawResponse","sleep","reasoning_content","finalActions","timeNow","timeRemaining","executables","errorFlag","String","Error","errorMsg","finalResult","type","demand","modelConfig","opt","multimodalPrompt","queryTask","taskContext","queryDump","applyDump","dump","ifTypeRestricted","demandInput","keyOfResult","booleanPrompt","extractResult","extraPageDescription","tree","descriptionOfTree","ServiceError","data","thought","outputResult","timeMs","_modelConfig","assertion","textPrompt","parsePrompt","description","timeoutMs","checkIntervalMs","overallStartTime","lastCheckStart","errorThought","currentCheckStart","now","sleepTask","interfaceInstance","service","opts","ConversationHistory","TaskBuilder"],"mappings":";;;;;;;;;;;;;;;;;;;;AA+CA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,wCAAwC;AAIvC,MAAMC;IAoBX,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,SAAS;IACvB;IA6BQ,uBACNC,KAAa,EACbC,OAA0C,EAC1C;QACA,OAAO,IAAIC,iBACTF,OACA,IAAMG,QAAQ,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,kBAAkB,KACrD;YACE,aAAa,IAAI,CAAC,mBAAmB;YACrC,OAAOF,SAAS;YAChB,cAAc,IAAI,CAAC,KAAK,EAAE;QAC5B;IAEJ;IAEQ,iBAAiC;QACvC,OAAO,IAAI,CAAC,mBAAmB;IACjC;IAEA,MAAa,wBACXG,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACzCL,OAGC,EACD;QACA,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,CAC3BG,OACAC,wBACAC,6BACAL;IAEJ;IAEA,MAAM,uBAAuBM,eAAuB,EAAEC,UAAkB,EAAE;QACxE,MAAMC,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUH;QAGzB,MAAMI,OAAmC;YACvC,MAAM;YACN,SAAS;YACT,OAAO;gBACLJ;YACF;YACA,UAAU,OAAOK,OAAOC;gBACtB,MAAM,EAAEC,SAAS,EAAE,GAAGD;gBACtBE,OAAOD,WAAW;gBAClB,OAAO;oBACL,QAAQ;wBACN,SAAS,EAAE;wBACX,oCAAoC;wBACpC,KAAK;wBACLN;oBACF;oBACA,OAAO;wBACL,KAAK;oBACP;oBACA,OAAO;wBACL,MAAM;wBACN,SAAS;4BACPA;wBACF;oBACF;gBACF;YACF;QACF;QACA,MAAMQ,SAASP,QAAQ,SAAS;QAChC,MAAMA,QAAQ,YAAY,CAACE;QAE3B,OAAO;YACLK;QACF;IACF;IAEA,MAAM,SACJhB,KAAa,EACbI,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACf;QAC1B,MAAMG,UAAU,IAAI,CAAC,sBAAsB,CAACT;QAC5C,MAAM,EAAEiB,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAClDb,OACAC,wBACAC;QAEF,MAAMU,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACQ;QAC1C,MAAM,EAAEE,MAAM,EAAE,GAAGD,UAAU,CAAC;QAC9B,OAAO;YACLC;YACAH;QACF;IACF;IAEA,MAAM,OACJI,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAC3BC,wBAAkC,EAClCC,sBAAwD,EAQxD;QACA,IAAI,CAAC,mBAAmB,CAAC,KAAK;QAE9B,MAAMlB,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUU;QAEzB,MAAMJ,SAASP,QAAQ,SAAS;QAEhC,IAAImB,cAAc;QAClB,MAAMC,WAAmC,EAAE;QAC3C,MAAMC,uBACJN,gCAAgC,IAAI,CAAC,oBAAoB;QAC3DT,OACEe,AAAyBC,WAAzBD,sBACA;QAGF,IAAIE,8BAA8B;QAGlC,MAAO,KAAM;YACX,MAAMd,SAAS,MAAMT,QAAQ,YAAY,CACvC;gBACE,MAAM;gBACN,SAAS;gBACT,OAAO;oBACL,iBAAiBW;oBACjBE;oBACAG;gBACF;gBACA,UAAU,OAAOb,OAAOC;oBACtB,MAAMoB,YAAYC,KAAK,GAAG;oBAC1B,MAAM,EAAEpB,SAAS,EAAE,GAAGD;oBACtBE,OAAOD,WAAW;oBAClB,MAAM,EAAEqB,MAAM,EAAE,GAAG9B;oBACnB,MAAM+B,qBACJD,AAAW,kBAAXA,SACI9B,uBAAuB,kBAAkB,GACzC0B;oBAEN,MAAMM,cAAc,IAAI,CAAC,cAAc;oBACvCzC,MACE,sCACAyC,YAAY,GAAG,CAAC,CAACC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;oBAEhDvB,OAAOwB,MAAM,OAAO,CAACF,cAAc;oBACnC,IAAIA,AAAuB,MAAvBA,YAAY,MAAM,EACpBG,QAAQ,IAAI,CACV,CAAC,gBAAgB,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,gDAAgD,CAAC;oBAIrG,MAAMC,aAAa,MAAOL,AAAAA,CAAAA,qBACtBM,iBACAC,IAAG,EAAG/B,MAAM,eAAe,EAAE;wBAC/B,SAASE;wBACT,eAAeF,MAAM,YAAY;wBACjC,eAAe,IAAI,CAAC,SAAS,CAAC,aAAa;wBAC3CyB;wBACA,aAAahC;wBACb,qBAAqB,IAAI,CAAC,mBAAmB;wBAC7C,aAAagB;wBACbI;wBACA,0BACEpB,AAAkC,eAAlCA,uBAAuB,MAAM,GACzBqB,2BACAK;wBACN,wBACE1B,AAAkC,oBAAlCA,uBAAuB,MAAM,IAC7BA,AAAkC,kBAAlCA,uBAAuB,MAAM,GACzBsB,yBACAI;oBACR;oBACAnC,MAAM,cAAcgD,KAAK,SAAS,CAACH,YAAY,MAAM;oBAErD,MAAM,EACJI,OAAO,EACPC,GAAG,EACHC,kCAAkC,EAClCC,KAAK,EACLC,KAAK,EACLC,WAAW,EACXC,KAAK,EACLC,iBAAiB,EAClB,GAAGX;oBAEJ5B,gBAAgB,IAAI,CAAC,GAAG,GAAG;wBACzB,GAAIA,gBAAgB,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;wBAClCqC;oBACF;oBACArC,gBAAgB,IAAI,CAAC,KAAK,GAAGoC;oBAC7BpC,gBAAgB,IAAI,CAAC,iBAAiB,GAAGuC;oBACzCvC,gBAAgB,IAAI,CAAC,MAAM,GAAG;wBAC5B,SAASgC,WAAW,EAAE;wBACtBE;wBACAD;wBACA,UAAUL,WAAW,QAAQ;oBAC/B;oBACA5B,gBAAgB,SAAS,GAAGC;oBAE5B,MAAMuC,eAAe;2BAAKR,WAAW,EAAE;qBAAE;oBAEzC,IAAIM,OAAO;wBACT,MAAMG,UAAUpB,KAAK,GAAG;wBACxB,MAAMqB,gBAAgBJ,QAASG,CAAAA,UAAUrB,SAAQ;wBACjD,IAAIsB,gBAAgB,GAClBF,aAAa,IAAI,CAAC,IAAI,CAAC,SAAS,CAACE;oBAErC;oBAEAxC,OAAO,CAACiC,OAAO,CAAC,oBAAoB,EAAEA,MAAM,EAAE,EAAEF,OAAO,IAAI;oBAE3D,OAAO;wBACL,OAAO;4BACL,KAAK;wBACP;oBACF;gBACF;YACF,GACA;gBACE,gBAAgB;YAClB;YAGF,MAAML,aAAavB,QAAQ;YAG3B,MAAMd,QAAQqC,YAAY,WAAW,EAAE;YACvCZ,SAAS,IAAI,IAAKY,YAAY,YAAY,EAAE;YAE5C,IAAIe;YACJ,IAAI;gBACFA,cAAc,MAAM,IAAI,CAAC,uBAAuB,CAC9CpD,OACAC,wBACAC,6BACA;oBACEiB;oBACA,SAAS;gBACX;YAEJ,EAAE,OAAOyB,OAAO;gBACd,OAAOvC,QAAQ,eAAe,CAC5B,CAAC,4CAA4C,EAAEuC,MAAM,SAAS,EAAEJ,KAAK,SAAS,CAC5ExC,QACC;YAEP;YACA,IAAI,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EACjDoC,QAAQ,IAAI,CACV,8FACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB;YAGnD,IAAIiB,YAAY;YAChB,IAAI;gBACF,MAAMhD,QAAQ,YAAY,CAAC+C,YAAY,KAAK;YAC9C,EAAE,OAAOR,OAAY;gBACnBS,YAAY;gBACZzB;gBACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAAG,CAAC,+BAA+B,EAAEgB,OAAO,WAAWU,OAAOV,QAAQ;gBACrHpD,MACE,yFACAoD,iBAAiBW,QAAQX,MAAM,OAAO,GAAGU,OAAOV,QAChD,6CACAhB;YAEJ;YAEA,IAAIA,8BAA8BlC,uCAChC,OAAOW,QAAQ,eAAe,CAAC;YAIjC,IAAI,CAACgC,YAAY,oCACf,IAAIgB,WACF7D,MACE;iBAGF;YAKJ,EAAEgC;YAEF,IAAIA,cAAcE,sBAAsB;gBACtC,MAAM8B,WAAW,CAAC,UAAU,EAAE9B,qBAAqB,4JAA4J,CAAC;gBAChN,OAAOrB,QAAQ,eAAe,CAACmD;YACjC;YAEA,IAAI,CAAC,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EAClD,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAC7C;QAEN;QAEA,MAAMC,cAAc;YAClB,QAAQ;gBACNhC;YACF;YACAb;QACF;QACA,OAAO6C;IACT;IAEQ,oBACNC,IAAsE,EACtEC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACpC;QACA,MAAMC,YAA4C;YAChD,MAAM;YACN,SAASL;YACT,OAAO;gBACL,YAAYI,mBACP;oBACCH;oBACAG;gBACF,IACAH;YACN;YACA,UAAU,OAAOnD,OAAOwD;gBACtB,MAAM,EAAEzD,IAAI,EAAE,GAAGyD;gBACjB,IAAIC;gBACJ,MAAMC,YAAY,CAACC;oBACjBF,YAAYE;oBACZ5D,KAAK,GAAG,GAAG;wBACT4D;oBACF;gBACF;gBAGA,MAAMzD,YAAYsD,YAAY,SAAS;gBACvCrD,OAAOD,WAAW;gBAElB,MAAM0D,mBAAmBV,AAAS,YAATA;gBACzB,IAAIW,cAAcV;gBAClB,IAAIW,cAAc;gBAClB,IAAIF,oBAAqBV,CAAAA,AAAS,aAATA,QAAqBA,AAAS,cAATA,IAAiB,GAAI;oBACjEY,cAAc;oBACd,MAAMC,gBACJb,AAAS,aAATA,OACI,CAAC,kDAAkD,EAAEC,QAAQ,GAC7D,CAAC,+GAA+G,EAAEA,QAAQ;oBAChIU,cAAc;wBACZ,CAACC,YAAY,EAAEC;oBACjB;gBACF,OAAO,IAAIH,kBACTC,cAAc;oBACZ,CAACC,YAAY,EAAE,GAAGZ,KAAK,EAAE,EAAEC,QAAQ;gBACrC;gBAGF,IAAIa;gBAEJ,IAAIC,uBAAuB;gBAC3B,IAAIZ,KAAK,eAAe,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE;oBAC1DrE,MAAM;oBACN,MAAMkF,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,mBAAmB;oBACrDD,uBAAuB,MAAME,kBAC3BD,MACA,KACA,OACAb,KAAK,gBAAgB;gBAEzB;gBAEA,IAAI;oBACFW,gBAAgB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CACxCH,aACAT,aACAC,KACAY,sBACAX;gBAEJ,EAAE,OAAOlB,OAAO;oBACd,IAAIA,iBAAiBgC,cACnBV,UAAUtB,MAAM,IAAI;oBAEtB,MAAMA;gBACR;gBAEA,MAAM,EAAEiC,IAAI,EAAEhC,KAAK,EAAEiC,OAAO,EAAEX,IAAI,EAAEnB,iBAAiB,EAAE,GAAGwB;gBAC1DN,UAAUC;gBACV5D,KAAK,iBAAiB,GAAGyC;gBAEzB,IAAI+B,eAAeF;gBACnB,IAAIT,kBAEF,IAAI,AAAgB,YAAhB,OAAOS,MACTE,eAAeF;qBACV,IAAInB,AAAS,cAATA,MAEPqB,eADEF,QAAAA,OACa,QAECA,IAAY,CAACP,YAAY;qBAEtC,IAAIO,QAAAA,MACTE,eAAe;qBACV;oBACLpE,OACEkE,MAAM,CAACP,YAAY,KAAK3C,QACxB;oBAEFoD,eAAgBF,IAAY,CAACP,YAAY;gBAC3C;gBAGF,IAAIZ,AAAS,aAATA,QAAqB,CAACqB,cAAc;oBACtCxE,KAAK,KAAK,GAAGsC;oBACbtC,KAAK,OAAO,GAAGuE;oBACf,MAAM,IAAIvB,MAAM,CAAC,kBAAkB,EAAEuB,SAAS;gBAChD;gBAEA,OAAO;oBACL,QAAQC;oBACR,KAAKd;oBACLpB;oBACAiC;gBACF;YACF;QACF;QAEA,OAAOf;IACT;IACA,MAAM,yBACJL,IAA0D,EAC1DC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACP;QAC7B,MAAMzD,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aACEoD,MACA,AAAkB,YAAlB,OAAOC,SAAsBA,SAASnB,KAAK,SAAS,CAACmB;QAIzD,MAAMI,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9CL,MACAC,QACAC,aACAC,KACAC;QAGF,MAAMlD,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAAC0D;QAE1C,IAAI,CAACjD,QACH,MAAM,IAAIyC,MACR;QAIJ,MAAM,EAAExC,MAAM,EAAE+D,OAAO,EAAE,GAAGhE;QAE5B,OAAO;YACLC;YACA+D;YACAlE;QACF;IACF;IAEQ,UAAUoE,MAAc,EAA4C;QAC1E,OAAO;YACL,MAAM;YACN,OAAO;gBACLA;YACF;QACF;IACF;IAEA,MAAM,aAAaA,MAAc,EAAEC,YAA0B,EAAE;QAC7D,OAAO,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;YACtCD;QACF;IACF;IAEA,MAAM,QACJE,SAAsB,EACtBrB,GAA+B,EAC/BD,WAAyB,EACO;QAChC,MAAM,EAAEuB,UAAU,EAAErB,gBAAgB,EAAE,GAAGsB,YAAYF;QAErD,MAAMG,cAAc,CAAC,SAAS,EAAEF,YAAY;QAC5C,MAAM9E,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,WAAW+E;QAE1B,MAAMzE,SAASP,QAAQ,SAAS;QAChC,MAAM,EAAEiF,SAAS,EAAEC,eAAe,EAAE,GAAG1B;QAEvClD,OAAOuE,WAAW;QAClBvE,OAAO2E,WAAW;QAClB3E,OAAO4E,iBAAiB;QAExB5E,OACE4E,mBAAmBD,WACnB,CAAC,iGAAiG,EAAEC,gBAAgB,aAAa,EAAED,UAAU,CAAC,CAAC;QAGjJ,MAAME,mBAAmB1D,KAAK,GAAG;QACjC,IAAI2D,iBAAiBD;QACrB,IAAIE,eAAe;QAEnB,MAAOD,iBAAiBD,oBAAoBF,UAAW;YACrD,MAAMK,oBAAoB7D,KAAK,GAAG;YAClC2D,iBAAiBE;YACjB,MAAM5B,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9C,WACAoB,YACAvB,aACAjC,QACAmC;YAGF,MAAMhD,SAAU,MAAMT,QAAQ,YAAY,CAAC0D;YAO3C,IAAIjD,QAAQ,QACV,OAAO;gBACL,QAAQa;gBACRf;YACF;YAGF8E,eACE5E,QAAQ,WACP,CAACA,UAAU,CAAC,0BAA0B,EAAEqE,YAAY,IACrD,CAAC,0CAA0C,EAAEA,YAAY;YAC3D,MAAMS,MAAM9D,KAAK,GAAG;YACpB,IAAI8D,MAAMD,oBAAoBJ,iBAAiB;gBAC7C,MAAMpC,gBAAgBoC,kBAAmBK,CAAAA,MAAMD,iBAAgB;gBAC/D,MAAME,YAAY,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;oBACjD,QAAQ1C;gBACV;gBACA,MAAM9C,QAAQ,MAAM,CAACwF;YACvB;QACF;QAEA,OAAOxF,QAAQ,eAAe,CAAC,CAAC,iBAAiB,EAAEqF,cAAc;IACnE;IAllBA,YACEI,iBAAoC,EACpCC,OAAgB,EAChBC,IAMC,CACD;QAjCF;QAEA;QAEA;QAEA,uBAAiB,uBAAjB;QAEA,uBAAiB,eAAjB;QAEA,uBAAQ,uBAAR;QAEA;QAEA,uBAAiB,SAAjB;QAEA;QAkBE,IAAI,CAAC,SAAS,GAAGF;QACjB,IAAI,CAAC,OAAO,GAAGC;QACf,IAAI,CAAC,SAAS,GAAGC,KAAK,SAAS;QAC/B,IAAI,CAAC,mBAAmB,GAAGA,MAAM;QACjC,IAAI,CAAC,oBAAoB,GAAGA,KAAK,oBAAoB;QACrD,IAAI,CAAC,KAAK,GAAGA,KAAK,KAAK;QACvB,IAAI,CAAC,mBAAmB,GAAG,IAAIC;QAC/B,IAAI,CAAC,mBAAmB,GAAGD,KAAK,WAAW;QAC3C,IAAI,CAAC,WAAW,GAAG,IAAIE,YAAY;YACjCJ;YACAC;YACA,WAAWC,KAAK,SAAS;YACzB,aAAa,IAAI,CAAC,cAAc;QAClC;IACF;AA0jBF"}
@@ -101,7 +101,7 @@ async function matchElementFromCache(context, cacheEntry, cachePrompt, cacheable
101
101
  return;
102
102
  }
103
103
  }
104
- const getMidsceneVersion = ()=>"1.0.5-beta-20251230131740.0";
104
+ const getMidsceneVersion = ()=>"1.0.5-beta-20251231022759.0";
105
105
  const parsePrompt = (prompt)=>{
106
106
  if ('string' == typeof prompt) return {
107
107
  textPrompt: prompt,
@@ -142,7 +142,8 @@ async function AiLocateElement(options) {
142
142
  errors: errors
143
143
  },
144
144
  rawResponse,
145
- usage: res.usage
145
+ usage: res.usage,
146
+ reasoning_content: res.reasoning_content
146
147
  };
147
148
  }
148
149
  async function AiLocateSection(options) {
@@ -250,7 +251,8 @@ async function AiExtractElementInfo(options) {
250
251
  const result = await callAIWithObjectResponse(msgs, AIActionType.EXTRACT_DATA, modelConfig);
251
252
  return {
252
253
  parseResult: result.content,
253
- usage: result.usage
254
+ usage: result.usage,
255
+ reasoning_content: result.reasoning_content
254
256
  };
255
257
  }
256
258
  async function AiJudgeOrderSensitive(description, callAIFn, modelConfig) {
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport {\n AIActionType,\n adaptBboxToRect,\n expandSearchArea,\n mergeRects,\n} from '../common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements = 'elements' in res.content ? res.content.elements : [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const { screenshotBase64 } = context;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(\n msgs,\n AIActionType.INSPECT_ELEMENT, // Reuse existing action type for now\n modelConfig,\n );\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","adaptBboxToRect","rectCenter","element","generateElementByPosition","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;AAsDA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OASrC;IASC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7BM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,4BAA4BP;IAEjD,IAAIQ,eAAeP;IACnB,IAAIQ,aAAab,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIc,cAAcd,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIe,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIf,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFa,eAAeb,QAAQ,YAAY,CAAC,WAAW;QAC/Cc,aAAad,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCe,cAAcf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCgB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIV,AAAW,iBAAXA,QAAyB;QAClC,MAAMa,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMvB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMkB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMC,MAAM,MAAMlB,SAASR,MAAM2B,aAAa,eAAe,EAAElB;IAE/D,MAAMmB,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBAAkB,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IAC3E,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAI,UAAUI,gBACRR,IAAI,OAAO,CAAC,IAAI,EAChBP,YACAC,aACAf,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BgB,oBACAC,qBACAZ;YAGFjB,aAAa,WAAWqC;YAExB,MAAMK,aAAa;gBACjB,GAAGL,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMM,UAA+BC,0BACnCF,YACAtB;YAEFmB,SAAS,EAAE;YAEX,IAAII,SACFL,kBAAkB;gBAACK;aAAQ;QAE/B;IACF,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACN,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEO,IAAI,CAAC,CAAC;aAFtBP,SAAS;YAACO;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMT;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAJ;QACA,OAAOF,IAAI,KAAK;IAClB;AACF;AAEO,eAAee,gBAAgBpC,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEoC,kBAAkB,EAAEjC,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAMU,eAAe2B,4BAA4BjC;IACjD,MAAMkC,gCAAgCC,0BACpCjD,wBAAwB8C;IAE1B,MAAM1C,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMiC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMjB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQ4C,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA1C,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMqB,SAAS,MAAMC,yBACnB/C,MACA2B,aAAa,YAAY,EACzBlB;IAGF,IAAIuC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAahB,gBACjBe,aACA3C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BuD;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DnD,aAAa,wBAAwBwD;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASpB,MAAM,OAAO,CAACoB,OAC/B,GAAG,CAAC,CAACA,OACGnB,gBACLmB,MACA/C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqByD;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DzD,aAAa,iBAAiB2D;QAG9BN,cAAcQ,iBAAiBF,YAAYhD,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BqD;IAC1C;IAEA,IAAIS,cAAc9C;IAClB,IAAIqC,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1BhD,kBACAqC,aACAtC,AAAW,iBAAXA;QAEF+C,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAajB,KAAK,SAAS,CAACiB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBvD,OAO7C;IACC,MAAM,EAAEwD,SAAS,EAAEvD,OAAO,EAAEwD,aAAa,EAAE/D,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe+C;IACrB,MAAM,EAAEpD,gBAAgB,EAAE,GAAGL;IAE7B,MAAM0D,wBAAwBC,uBAC5B5D,QAAQ,eAAe,IAAI,IAC3BwD;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKvD;YACL,QAAQ;QACV;IACF;IAGFuD,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMhE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASkD;QACX;KACD;IAED,IAAInE,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMqB,SAAS,MAAMC,yBACnB/C,MACA2B,aAAa,YAAY,EACzBlB;IAEF,OAAO;QACL,aAAaqC,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB5D,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeqD;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMpE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASsD;QACX;KACD;IAED,MAAMxB,SAAS,MAAMtC,SACnBR,MACA2B,aAAa,eAAe,EAC5BlB;IAGF,OAAO;QACL,kBAAkBqC,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport {\n AIActionType,\n adaptBboxToRect,\n expandSearchArea,\n mergeRects,\n} from '../common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements = 'elements' in res.content ? res.content.elements : [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const { screenshotBase64 } = context;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n reasoning_content: result.reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(\n msgs,\n AIActionType.INSPECT_ELEMENT, // Reuse existing action type for now\n modelConfig,\n );\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","adaptBboxToRect","rectCenter","element","generateElementByPosition","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;AAsDA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OASrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7BM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,4BAA4BP;IAEjD,IAAIQ,eAAeP;IACnB,IAAIQ,aAAab,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIc,cAAcd,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIe,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIf,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFa,eAAeb,QAAQ,YAAY,CAAC,WAAW;QAC/Cc,aAAad,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCe,cAAcf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCgB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIV,AAAW,iBAAXA,QAAyB;QAClC,MAAMa,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMvB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMkB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMC,MAAM,MAAMlB,SAASR,MAAM2B,aAAa,eAAe,EAAElB;IAE/D,MAAMmB,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBAAkB,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IAC3E,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAI,UAAUI,gBACRR,IAAI,OAAO,CAAC,IAAI,EAChBP,YACAC,aACAf,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BgB,oBACAC,qBACAZ;YAGFjB,aAAa,WAAWqC;YAExB,MAAMK,aAAa;gBACjB,GAAGL,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMM,UAA+BC,0BACnCF,YACAtB;YAEFmB,SAAS,EAAE;YAEX,IAAII,SACFL,kBAAkB;gBAACK;aAAQ;QAE/B;IACF,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACN,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEO,IAAI,CAAC,CAAC;aAFtBP,SAAS;YAACO;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMT;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAJ;QACA,OAAOF,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAee,gBAAgBpC,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEoC,kBAAkB,EAAEjC,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAMU,eAAe2B,4BAA4BjC;IACjD,MAAMkC,gCAAgCC,0BACpCjD,wBAAwB8C;IAE1B,MAAM1C,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMiC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMjB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQ4C,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA1C,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMqB,SAAS,MAAMC,yBACnB/C,MACA2B,aAAa,YAAY,EACzBlB;IAGF,IAAIuC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAahB,gBACjBe,aACA3C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BuD;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DnD,aAAa,wBAAwBwD;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASpB,MAAM,OAAO,CAACoB,OAC/B,GAAG,CAAC,CAACA,OACGnB,gBACLmB,MACA/C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqByD;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DzD,aAAa,iBAAiB2D;QAG9BN,cAAcQ,iBAAiBF,YAAYhD,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BqD;IAC1C;IAEA,IAAIS,cAAc9C;IAClB,IAAIqC,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1BhD,kBACAqC,aACAtC,AAAW,iBAAXA;QAEF+C,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAajB,KAAK,SAAS,CAACiB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBvD,OAO7C;IACC,MAAM,EAAEwD,SAAS,EAAEvD,OAAO,EAAEwD,aAAa,EAAE/D,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe+C;IACrB,MAAM,EAAEpD,gBAAgB,EAAE,GAAGL;IAE7B,MAAM0D,wBAAwBC,uBAC5B5D,QAAQ,eAAe,IAAI,IAC3BwD;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKvD;YACL,QAAQ;QACV;IACF;IAGFuD,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMhE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASkD;QACX;KACD;IAED,IAAInE,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMqB,SAAS,MAAMC,yBACnB/C,MACA2B,aAAa,YAAY,EACzBlB;IAEF,OAAO;QACL,aAAaqC,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;QACnB,mBAAmBA,OAAO,iBAAiB;IAC7C;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB5D,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeqD;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMpE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASsD;QACX;KACD;IAED,MAAMxB,SAAS,MAAMtC,SACnBR,MACA2B,aAAa,eAAe,EAC5BlB;IAGF,OAAO;QACL,kBAAkBqC,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -82,7 +82,10 @@ async function plan(userInstruction, opts) {
82
82
  ...instruction,
83
83
  ...historyLog
84
84
  ];
85
- const { content: planFromAI, contentString: rawResponse, usage } = await callAIWithObjectResponse(msgs, AIActionType.PLAN, modelConfig);
85
+ const { content: planFromAI, contentString: rawResponse, usage, reasoning_content } = await callAIWithObjectResponse(msgs, AIActionType.PLAN, modelConfig, {
86
+ qwen3_vl_enable_thinking: opts.qwen3_vl_enable_thinking,
87
+ doubao_enable_thinking: opts.doubao_enable_thinking
88
+ });
86
89
  const actions = planFromAI.action ? [
87
90
  planFromAI.action
88
91
  ] : [];
@@ -91,6 +94,7 @@ async function plan(userInstruction, opts) {
91
94
  actions,
92
95
  rawResponse,
93
96
  usage,
97
+ reasoning_content,
94
98
  yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
95
99
  };
96
100
  assert(planFromAI, "can't get plans from AI");
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIActionType,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","callAIWithObjectResponse","AIActionType","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","undefined","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IASC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACN,GAAG,MAAMC,yBACRJ,MACAK,aAAa,IAAI,EACjBvB;IAGF,MAAMwB,UAAUL,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMM,cAAkC;QACtC,GAAGN,UAAU;QACbK;QACAJ;QACAC;QACA,UAAUK,uBACRF,SACA1B,KAAK,WAAW,EAChBqB,WAAW,KAAK;IAEpB;IAEAQ,OAAOR,YAAY;IAEnBK,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBhC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC8B,SAAWA,OAAO,IAAI,KAAKC;QAG9BnC,MAAM,+BAA+BoC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENpC,MAAM,gBAAgBqC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB9B,AAAW+B,WAAX/B,QAElBwB,OAAO,KAAK,CAACK,MAAM,GAAGG,cACpBF,cACA1B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEoB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACAxC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOK;AACT"}
1
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIActionType,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n qwen3_vl_enable_thinking?: boolean;\n doubao_enable_thinking?: 'enabled' | 'disabled' | 'auto';\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n reasoning_content,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n {\n qwen3_vl_enable_thinking: opts.qwen3_vl_enable_thinking,\n doubao_enable_thinking: opts.doubao_enable_thinking,\n },\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","reasoning_content","callAIWithObjectResponse","AIActionType","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","undefined","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAWC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,yBACRL,MACAM,aAAa,IAAI,EACjBxB,aACA;QACE,0BAA0BF,KAAK,wBAAwB;QACvD,wBAAwBA,KAAK,sBAAsB;IACrD;IAGF,MAAM2B,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMO,cAAkC;QACtC,GAAGP,UAAU;QACbM;QACAL;QACAC;QACAC;QACA,UAAUK,uBACRF,SACA3B,KAAK,WAAW,EAChBqB,WAAW,KAAK;IAEpB;IAEAS,OAAOT,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBjC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC+B,SAAWA,OAAO,IAAI,KAAKC;QAG9BpC,MAAM,+BAA+BqC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENrC,MAAM,gBAAgBsC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB/B,AAAWgC,WAAXhC,QAElByB,OAAO,KAAK,CAACK,MAAM,GAAGG,cACpBF,cACA3B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEqB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACAzC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOM;AACT"}
@@ -48,14 +48,7 @@ async function systemPromptToTaskPlanning({ actionSpace, vlMode, includeBbox })
48
48
  if (includeBbox && !vlMode) throw new Error('vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.');
49
49
  const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam(includeBbox ? vlMode : void 0)));
50
50
  const actionList = actionDescriptionList.join('\n');
51
- const logFieldInstruction = 'qwen2.5-vl' === vlMode || 'qwen3-vl' === vlMode ? `
52
- ## About the \`log\` field
53
-
54
- * Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction.
55
- * The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....".
56
- * If no action should be done, log the reason.
57
- * Use the same language as the user's instruction.
58
- ` : `
51
+ const logFieldInstruction = `
59
52
  ## About the \`log\` field (preamble message)
60
53
 
61
54
  The \`log\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:
@@ -71,7 +64,6 @@ The \`log\` field is a brief preamble message to the user explaining what you’
71
64
  - "Previous actions failed to find the 'Yes' button, i will try again"
72
65
  - "Go back to find the login button"
73
66
  `;
74
- const logSample = 'qwen2.5-vl' === vlMode || 'qwen3-vl' === vlMode ? 'The user wants to fill the form and login. According to the instruction and the previous logs, next step is to click the login button. Now i am going to compose an action "Tap" to click the login button.' : 'Click the login button';
75
67
  return `
76
68
  Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.
77
69
 
@@ -96,7 +88,7 @@ ${logFieldInstruction}
96
88
 
97
89
  Return in JSON format:
98
90
  {
99
- "log": string, // a preamble message to the user explaining what you’re about to do
91
+ "log": string, // a brief preamble to the user explaining what you’re about to do
100
92
  ${commonOutputFields}
101
93
  "action":
102
94
  {
@@ -112,7 +104,7 @@ Return in JSON format:
112
104
  For example, if the instruction is to login and the form has already been filled, this is a valid return value:
113
105
 
114
106
  {
115
- "log": "${logSample}",
107
+ "log": "Click the login button",
116
108
  "more_actions_needed_by_instruction": false,
117
109
  "action": {
118
110
  "type": "Tap",
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction =\n vlMode === 'qwen2.5-vl' || vlMode === 'qwen3-vl'\n ? `\n## About the \\`log\\` field\n\n* Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. \n* The log should contain the following information: \"The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....\". \n* If no action should be done, log the reason. \n* Use the same language as the user's instruction.\n`\n : `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n const logSample =\n vlMode === 'qwen2.5-vl' || vlMode === 'qwen3-vl'\n ? 'The user wants to fill the form and login. According to the instruction and the previous logs, next step is to click the login button. Now i am going to compose an action \"Tap\" to click the login button.'\n : 'Click the login button';\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a preamble message to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"${logSample}\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","field","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","paramLine","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction","logSample"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACH,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAME,aACJ,AACE,cADF,OAAQF,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMG,kBAAkBD,aAAa,GAAGH,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMK,WAAWC,eAAeL,OAAOR;gBAGvC,MAAMc,cAAcC,kBAAkBP;gBAGtC,IAAIQ,YAAY,GAAGL,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,IAAIE,aACFE,aAAa,CAAC,IAAI,EAAEF,aAAa;gBAGnCX,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACc;oBAClBf,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEe,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAML,WAAWC,eAAeT;YAChC,MAAMU,cAAcC,kBAAkBX;YAGtC,IAAIc,mBAAmB,CAAC,SAAS,EAAEN,UAAU;YAC7C,IAAIE,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpBhB,OAAO,IAAI,CAACgB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEnB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAekB,2BAA2B,EAC/CC,WAAW,EACXxB,MAAM,EACNyB,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACzB,QAClB,MAAM,IAAI0B,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAACrB,SACtCD,qBACLC,QACAJ,cAAc0B,cAAczB,SAAS4B;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBACJ9B,AAAW,iBAAXA,UAA2BA,AAAW,eAAXA,SACvB,CAAC;;;;;;;AAOT,CAAC,GACO,CAAC;;;;;;;;;;;;;;;AAeT,CAAC;IAEC,MAAM+B,YACJ/B,AAAW,iBAAXA,UAA2BA,AAAW,eAAXA,SACvB,gNACA;IAEN,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAE6B,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAEhC,mBAAmB;;;;;;;;;;;;;;;UAeb,EAAEiC,UAAU;;;;;;oCAMc,EAAE/B,UAAUyB,cAAc,mCAAmC,GAAG;;;;;AAKpG,CAAC;AACD"}
1
+ {"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"Click the login button\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","field","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","paramLine","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACH,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAME,aACJ,AACE,cADF,OAAQF,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMG,kBAAkBD,aAAa,GAAGH,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMK,WAAWC,eAAeL,OAAOR;gBAGvC,MAAMc,cAAcC,kBAAkBP;gBAGtC,IAAIQ,YAAY,GAAGL,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,IAAIE,aACFE,aAAa,CAAC,IAAI,EAAEF,aAAa;gBAGnCX,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACc;oBAClBf,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEe,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAML,WAAWC,eAAeT;YAChC,MAAMU,cAAcC,kBAAkBX;YAGtC,IAAIc,mBAAmB,CAAC,SAAS,EAAEN,UAAU;YAC7C,IAAIE,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpBhB,OAAO,IAAI,CAACgB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEnB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAekB,2BAA2B,EAC/CC,WAAW,EACXxB,MAAM,EACNyB,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACzB,QAClB,MAAM,IAAI0B,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAACrB,SACtCD,qBACLC,QACAJ,cAAc0B,cAAczB,SAAS4B;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAEhC,mBAAmB;;;;;;;;;;;;;;;;;;;;;oCAqBa,EAAEE,UAAUyB,cAAc,mCAAmC,GAAG;;;;;AAKpG,CAAC;AACD"}
@@ -116,6 +116,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
116
116
  const isStreaming = options?.stream && options?.onChunk;
117
117
  let content;
118
118
  let accumulated = '';
119
+ let accumulatedReasoning = '';
119
120
  let usage;
120
121
  let timeCost;
121
122
  const buildUsageInfo = (usageData)=>{
@@ -139,7 +140,14 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
139
140
  ...'qwen2.5-vl' === vlMode ? {
140
141
  vl_high_resolution_images: true
141
142
  } : {},
142
- enable_thinking: true
143
+ ...options?.qwen3_vl_enable_thinking !== void 0 ? {
144
+ enable_thinking: options.qwen3_vl_enable_thinking
145
+ } : {},
146
+ ...('doubao-vision' === vlMode || 'vlm-ui-tars' === vlMode) && options?.doubao_enable_thinking ? {
147
+ thinking: {
148
+ type: options.doubao_enable_thinking
149
+ }
150
+ } : {}
143
151
  };
144
152
  try {
145
153
  debugCall(`sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`);
@@ -157,6 +165,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
157
165
  if (chunk.usage) usage = chunk.usage;
158
166
  if (content || reasoning_content) {
159
167
  accumulated += content;
168
+ accumulatedReasoning += reasoning_content;
160
169
  const chunkData = {
161
170
  content,
162
171
  reasoning_content,
@@ -200,9 +209,11 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
200
209
  debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
201
210
  assert(result.choices, `invalid response from LLM service: ${JSON.stringify(result)}`);
202
211
  content = result.choices[0].message.content;
212
+ accumulatedReasoning = result.choices[0].message?.reasoning_content || '';
203
213
  usage = result.usage;
204
214
  }
205
- debugCall(`response: ${content}`);
215
+ debugCall(`response reasoning content: ${accumulatedReasoning}`);
216
+ debugCall(`response content: ${content}`);
206
217
  assert(content, 'empty content');
207
218
  if (isStreaming && !usage) {
208
219
  const estimatedTokens = Math.max(1, Math.floor((content || '').length / 4));
@@ -214,6 +225,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
214
225
  }
215
226
  return {
216
227
  content: content || '',
228
+ reasoning_content: accumulatedReasoning || void 0,
217
229
  usage: buildUsageInfo(usage),
218
230
  isStreamed: !!isStreaming
219
231
  };
@@ -225,8 +237,11 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
225
237
  throw newError;
226
238
  }
227
239
  }
228
- async function callAIWithObjectResponse(messages, AIActionTypeValue, modelConfig) {
229
- const response = await callAI(messages, AIActionTypeValue, modelConfig);
240
+ async function callAIWithObjectResponse(messages, AIActionTypeValue, modelConfig, options) {
241
+ const response = await callAI(messages, AIActionTypeValue, modelConfig, {
242
+ qwen3_vl_enable_thinking: options?.qwen3_vl_enable_thinking,
243
+ doubao_enable_thinking: options?.doubao_enable_thinking
244
+ });
230
245
  assert(response, 'empty response');
231
246
  const vlMode = modelConfig.vlMode;
232
247
  const jsonContent = safeParseJson(response.content, vlMode);
@@ -234,7 +249,8 @@ async function callAIWithObjectResponse(messages, AIActionTypeValue, modelConfig
234
249
  return {
235
250
  content: jsonContent,
236
251
  contentString: response.content,
237
- usage: response.usage
252
+ usage: response.usage,
253
+ reasoning_content: response.reasoning_content
238
254
  };
239
255
  }
240
256
  async function callAIWithStringResponse(msgs, AIActionTypeValue, modelConfig) {