@midscene/core 1.2.2-beta-20260115092052.0 → 1.2.2-beta-20260116060040.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/dist/es/agent/agent.mjs +27 -24
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +15 -29
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +4 -2
  6. package/dist/es/agent/utils.mjs.map +1 -1
  7. package/dist/es/ai-model/inspect.mjs +3 -3
  8. package/dist/es/ai-model/inspect.mjs.map +1 -1
  9. package/dist/es/ai-model/llm-planning.mjs +14 -4
  10. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  11. package/dist/es/ai-model/prompt/llm-planning.mjs +44 -9
  12. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  13. package/dist/es/ai-model/service-caller/index.mjs +11 -10
  14. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  15. package/dist/es/ai-model/ui-tars-planning.mjs +3 -2
  16. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  17. package/dist/es/common.mjs +14 -5
  18. package/dist/es/common.mjs.map +1 -1
  19. package/dist/es/device/index.mjs +29 -4
  20. package/dist/es/device/index.mjs.map +1 -1
  21. package/dist/es/dump/html-utils.mjs +42 -0
  22. package/dist/es/dump/html-utils.mjs.map +1 -0
  23. package/dist/es/dump/image-restoration.mjs +31 -0
  24. package/dist/es/dump/image-restoration.mjs.map +1 -0
  25. package/dist/es/dump/index.mjs +3 -0
  26. package/dist/es/index.mjs +4 -2
  27. package/dist/es/index.mjs.map +1 -1
  28. package/dist/es/screenshot-item.mjs +34 -0
  29. package/dist/es/screenshot-item.mjs.map +1 -0
  30. package/dist/es/service/index.mjs +2 -1
  31. package/dist/es/service/index.mjs.map +1 -1
  32. package/dist/es/task-runner.mjs +7 -9
  33. package/dist/es/task-runner.mjs.map +1 -1
  34. package/dist/es/types.mjs +78 -1
  35. package/dist/es/types.mjs.map +1 -1
  36. package/dist/es/utils.mjs +3 -2
  37. package/dist/es/utils.mjs.map +1 -1
  38. package/dist/lib/agent/agent.js +24 -21
  39. package/dist/lib/agent/agent.js.map +1 -1
  40. package/dist/lib/agent/tasks.js +15 -29
  41. package/dist/lib/agent/tasks.js.map +1 -1
  42. package/dist/lib/agent/utils.js +4 -2
  43. package/dist/lib/agent/utils.js.map +1 -1
  44. package/dist/lib/ai-model/inspect.js +3 -3
  45. package/dist/lib/ai-model/inspect.js.map +1 -1
  46. package/dist/lib/ai-model/llm-planning.js +13 -3
  47. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  48. package/dist/lib/ai-model/prompt/llm-planning.js +44 -9
  49. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  50. package/dist/lib/ai-model/service-caller/index.js +11 -10
  51. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  52. package/dist/lib/ai-model/ui-tars-planning.js +3 -2
  53. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  54. package/dist/lib/common.js +20 -5
  55. package/dist/lib/common.js.map +1 -1
  56. package/dist/lib/device/index.js +53 -16
  57. package/dist/lib/device/index.js.map +1 -1
  58. package/dist/lib/dump/html-utils.js +94 -0
  59. package/dist/lib/dump/html-utils.js.map +1 -0
  60. package/dist/lib/dump/image-restoration.js +65 -0
  61. package/dist/lib/dump/image-restoration.js.map +1 -0
  62. package/dist/lib/dump/index.js +60 -0
  63. package/dist/lib/dump/index.js.map +1 -0
  64. package/dist/lib/index.js +42 -7
  65. package/dist/lib/index.js.map +1 -1
  66. package/dist/lib/screenshot-item.js +68 -0
  67. package/dist/lib/screenshot-item.js.map +1 -0
  68. package/dist/lib/service/index.js +2 -1
  69. package/dist/lib/service/index.js.map +1 -1
  70. package/dist/lib/task-runner.js +7 -9
  71. package/dist/lib/task-runner.js.map +1 -1
  72. package/dist/lib/types.js +91 -3
  73. package/dist/lib/types.js.map +1 -1
  74. package/dist/lib/utils.js +3 -2
  75. package/dist/lib/utils.js.map +1 -1
  76. package/dist/types/agent/agent.d.ts +5 -16
  77. package/dist/types/agent/tasks.d.ts +4 -4
  78. package/dist/types/ai-model/service-caller/index.d.ts +3 -3
  79. package/dist/types/common.d.ts +8 -1
  80. package/dist/types/device/index.d.ts +85 -23
  81. package/dist/types/dump/html-utils.d.ts +7 -0
  82. package/dist/types/dump/image-restoration.d.ts +6 -0
  83. package/dist/types/dump/index.d.ts +5 -0
  84. package/dist/types/index.d.ts +3 -1
  85. package/dist/types/screenshot-item.d.ts +27 -0
  86. package/dist/types/task-runner.d.ts +1 -1
  87. package/dist/types/types.d.ts +61 -6
  88. package/package.json +2 -2
@@ -1 +1 @@
1
- {"version":3,"file":"agent/tasks.mjs","sources":["../../../src/agent/tasks.ts"],"sourcesContent":["import { ConversationHistory, plan, uiTarsPlanning } from '@/ai-model';\nimport type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface, FileChooserHandler } from '@/device';\nimport type Service from '@/service';\nimport type { TaskRunner } from '@/task-runner';\nimport { TaskExecutionError } from '@/task-runner';\nimport type {\n DeepThinkOption,\n DeviceAction,\n ExecutionTaskApply,\n ExecutionTaskInsightQueryApply,\n ExecutionTaskPlanningApply,\n ExecutionTaskProgressOptions,\n InterfaceType,\n MidsceneYamlFlowItem,\n PlanningAIResponse,\n PlanningAction,\n PlanningActionParamSleep,\n PlanningActionParamWaitFor,\n ServiceDump,\n ServiceExtractOption,\n ServiceExtractParam,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ExecutionSession } from './execution-session';\nimport { TaskBuilder } from './task-builder';\nimport type { TaskCache } from './task-cache';\nexport { locatePlanForLocate } from './task-builder';\nimport { descriptionOfTree } from '@midscene/shared/extractor';\nimport { taskTitleStr } from './ui-utils';\nimport { parsePrompt } from './utils';\n\ninterface ExecutionResult<OutputType = any> {\n output: OutputType;\n thought?: string;\n runner: TaskRunner;\n}\n\ninterface TaskExecutorHooks {\n onTaskUpdate?: (\n runner: TaskRunner,\n error?: TaskExecutionError,\n ) => Promise<void> | void;\n}\n\nconst debug = getDebug('device-task-executor');\nconst maxErrorCountAllowedInOnePlanningLoop = 5;\n\nexport { TaskExecutionError };\n\nexport class TaskExecutor {\n interface: AbstractInterface;\n\n service: Service;\n\n taskCache?: TaskCache;\n\n private readonly providedActionSpace: DeviceAction[];\n\n private readonly taskBuilder: TaskBuilder;\n\n private conversationHistory: ConversationHistory;\n\n onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];\n\n private readonly hooks?: TaskExecutorHooks;\n\n replanningCycleLimit?: number;\n\n // @deprecated use .interface instead\n get page() {\n return this.interface;\n }\n\n constructor(\n interfaceInstance: AbstractInterface,\n service: Service,\n opts: {\n taskCache?: TaskCache;\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n replanningCycleLimit?: number;\n hooks?: TaskExecutorHooks;\n actionSpace: DeviceAction[];\n },\n ) {\n this.interface = interfaceInstance;\n this.service = service;\n this.taskCache = opts.taskCache;\n this.onTaskStartCallback = opts?.onTaskStart;\n this.replanningCycleLimit = opts.replanningCycleLimit;\n this.hooks = opts.hooks;\n this.conversationHistory = new ConversationHistory();\n this.providedActionSpace = opts.actionSpace;\n this.taskBuilder = new TaskBuilder({\n interfaceInstance,\n service,\n taskCache: opts.taskCache,\n actionSpace: this.getActionSpace(),\n });\n }\n\n private createExecutionSession(\n title: string,\n options?: { tasks?: ExecutionTaskApply[] },\n ) {\n return new ExecutionSession(\n title,\n () => Promise.resolve(this.service.contextRetrieverFn()),\n {\n onTaskStart: this.onTaskStartCallback,\n tasks: options?.tasks,\n onTaskUpdate: this.hooks?.onTaskUpdate,\n },\n );\n }\n\n private getActionSpace(): DeviceAction[] {\n return this.providedActionSpace;\n }\n\n public async convertPlanToExecutable(\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n options?: {\n cacheable?: boolean;\n subTask?: boolean;\n },\n ) {\n return this.taskBuilder.build(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n options,\n );\n }\n\n async loadYamlFlowAsPlanning(userInstruction: string, yamlString: string) {\n const session = this.createExecutionSession(\n taskTitleStr('Action', userInstruction),\n );\n\n const task: ExecutionTaskPlanningApply = {\n type: 'Planning',\n subType: 'LoadYaml',\n param: {\n userInstruction,\n },\n executor: async (param, executorContext) => {\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n return {\n output: {\n actions: [],\n more_actions_needed_by_instruction: false,\n log: '',\n yamlString,\n },\n cache: {\n hit: true,\n },\n hitBy: {\n from: 'Cache',\n context: {\n yamlString,\n },\n },\n };\n },\n };\n const runner = session.getRunner();\n await session.appendAndRun(task);\n\n return {\n runner,\n };\n }\n\n async runPlans(\n title: string,\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n ): Promise<ExecutionResult> {\n const session = this.createExecutionSession(title);\n const { tasks } = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n );\n const runner = session.getRunner();\n const result = await session.appendAndRun(tasks);\n const { output } = result ?? {};\n return {\n output,\n runner,\n };\n }\n\n async action(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n deepThink?: DeepThinkOption,\n fileChooserAccept?: string[],\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n }\n | undefined\n >\n > {\n return withFileChooser(this.interface, fileChooserAccept, async () => {\n return this.runAction(\n userPrompt,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n includeBboxInPlanning,\n aiActContext,\n cacheable,\n replanningCycleLimitOverride,\n imagesIncludeCount,\n deepThink,\n );\n });\n }\n\n private async runAction(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n deepThink?: DeepThinkOption,\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n }\n | undefined\n >\n > {\n this.conversationHistory.reset();\n\n const session = this.createExecutionSession(\n taskTitleStr('Action', userPrompt),\n );\n const runner = session.getRunner();\n\n let replanCount = 0;\n const yamlFlow: MidsceneYamlFlowItem[] = [];\n const replanningCycleLimit =\n replanningCycleLimitOverride ?? this.replanningCycleLimit;\n assert(\n replanningCycleLimit !== undefined,\n 'replanningCycleLimit is required for TaskExecutor.action',\n );\n\n let errorCountInOnePlanningLoop = 0; // count the number of errors in one planning loop\n\n // Main planning loop - unified plan/replan logic\n while (true) {\n const result = await session.appendAndRun(\n {\n type: 'Planning',\n subType: 'Plan',\n param: {\n userInstruction: userPrompt,\n aiActContext,\n imagesIncludeCount,\n deepThink,\n },\n executor: async (param, executorContext) => {\n const startTime = Date.now();\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n const { vlMode } = modelConfigForPlanning;\n const uiTarsModelVersion =\n vlMode === 'vlm-ui-tars'\n ? modelConfigForPlanning.uiTarsModelVersion\n : undefined;\n\n const actionSpace = this.getActionSpace();\n debug(\n 'actionSpace for this interface is:',\n actionSpace.map((action) => action.name).join(', '),\n );\n assert(Array.isArray(actionSpace), 'actionSpace must be an array');\n if (actionSpace.length === 0) {\n console.warn(\n `ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`,\n );\n }\n\n const planResult = await (uiTarsModelVersion\n ? uiTarsPlanning\n : plan)(param.userInstruction, {\n context: uiContext,\n actionContext: param.aiActContext,\n interfaceType: this.interface.interfaceType as InterfaceType,\n actionSpace,\n modelConfig: modelConfigForPlanning,\n conversationHistory: this.conversationHistory,\n includeBbox: includeBboxInPlanning,\n imagesIncludeCount,\n deepThink,\n });\n debug('planResult', JSON.stringify(planResult, null, 2));\n\n const {\n actions,\n log,\n more_actions_needed_by_instruction,\n error,\n usage,\n rawResponse,\n sleep,\n reasoning_content,\n } = planResult;\n\n executorContext.task.log = {\n ...(executorContext.task.log || {}),\n rawResponse,\n };\n executorContext.task.usage = usage;\n executorContext.task.reasoning_content = reasoning_content;\n executorContext.task.output = {\n actions: actions || [],\n more_actions_needed_by_instruction,\n log,\n yamlFlow: planResult.yamlFlow,\n };\n executorContext.uiContext = uiContext;\n\n const finalActions = [...(actions || [])];\n\n if (sleep) {\n const timeNow = Date.now();\n const timeRemaining = sleep - (timeNow - startTime);\n if (timeRemaining > 0) {\n finalActions.push(this.sleepPlan(timeRemaining));\n }\n }\n\n assert(!error, `Failed to continue: ${error}\\n${log || ''}`);\n\n return {\n cache: {\n hit: false,\n },\n } as any;\n },\n },\n {\n allowWhenError: true,\n },\n );\n\n const planResult = result?.output as PlanningAIResponse | undefined;\n\n // Execute planned actions\n const plans = planResult?.actions || [];\n yamlFlow.push(...(planResult?.yamlFlow || []));\n\n let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;\n try {\n executables = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n {\n cacheable,\n subTask: true,\n },\n );\n } catch (error) {\n return session.appendErrorPlan(\n `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(\n plans,\n )}`,\n );\n }\n if (this.conversationHistory.pendingFeedbackMessage) {\n console.warn(\n 'unconsumed pending feedback message detected, this may lead to unexpected planning result:',\n this.conversationHistory.pendingFeedbackMessage,\n );\n }\n let errorFlag = false;\n try {\n await session.appendAndRun(executables.tasks);\n } catch (error: any) {\n errorFlag = true;\n errorCountInOnePlanningLoop++;\n this.conversationHistory.pendingFeedbackMessage = `Error executing running tasks: ${error?.message || String(error)}`;\n debug(\n 'error when executing running tasks, but continue to run if it is not too many errors:',\n error instanceof Error ? error.message : String(error),\n 'current error count in one planning loop:',\n errorCountInOnePlanningLoop,\n );\n }\n\n if (errorCountInOnePlanningLoop > maxErrorCountAllowedInOnePlanningLoop) {\n return session.appendErrorPlan('Too many errors in one planning loop');\n }\n\n // Check if task is complete\n if (!planResult?.more_actions_needed_by_instruction) {\n if (errorFlag) {\n debug(\n 'more_actions_needed_by_instruction is false, but there are errors in one planning loop, continue to run',\n );\n } else {\n break;\n }\n }\n\n // Increment replan count for next iteration\n ++replanCount;\n\n if (replanCount > replanningCycleLimit) {\n const errorMsg = `Replanned ${replanningCycleLimit} times, exceeding the limit. Please configure a larger value for replanningCycleLimit (or use MIDSCENE_REPLANNING_CYCLE_LIMIT) to handle more complex tasks.`;\n return session.appendErrorPlan(errorMsg);\n }\n\n if (!this.conversationHistory.pendingFeedbackMessage) {\n this.conversationHistory.pendingFeedbackMessage =\n 'I have finished the action previously planned.';\n }\n }\n\n return {\n output: {\n yamlFlow,\n },\n runner,\n };\n }\n\n private createTypeQueryTask(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert' | 'WaitFor',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ) {\n const queryTask: ExecutionTaskInsightQueryApply = {\n type: 'Insight',\n subType: type,\n param: {\n dataDemand: multimodalPrompt\n ? ({\n demand,\n multimodalPrompt,\n } as never)\n : demand, // for user param presentation in report right sidebar\n },\n executor: async (param, taskContext) => {\n const { task } = taskContext;\n let queryDump: ServiceDump | undefined;\n const applyDump = (dump: ServiceDump) => {\n queryDump = dump;\n task.log = {\n dump,\n };\n };\n\n // Get context for query operations\n const uiContext = taskContext.uiContext;\n assert(uiContext, 'uiContext is required for Query task');\n\n const ifTypeRestricted = type !== 'Query';\n let demandInput = demand;\n let keyOfResult = 'result';\n if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {\n keyOfResult = 'StatementIsTruthy';\n const booleanPrompt =\n type === 'Assert'\n ? `Boolean, whether the following statement is true: ${demand}`\n : `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;\n demandInput = {\n [keyOfResult]: booleanPrompt,\n };\n } else if (ifTypeRestricted) {\n demandInput = {\n [keyOfResult]: `${type}, ${demand}`,\n };\n }\n\n let extractResult;\n\n let extraPageDescription = '';\n if (opt?.domIncluded && this.interface.getElementsNodeTree) {\n debug('appending tree info for page');\n const tree = await this.interface.getElementsNodeTree();\n extraPageDescription = await descriptionOfTree(\n tree,\n 200,\n false,\n opt?.domIncluded === 'visible-only',\n );\n }\n\n try {\n extractResult = await this.service.extract<any>(\n demandInput,\n modelConfig,\n opt,\n extraPageDescription,\n multimodalPrompt,\n );\n } catch (error) {\n if (error instanceof ServiceError) {\n applyDump(error.dump);\n }\n throw error;\n }\n\n const { data, usage, thought, dump, reasoning_content } = extractResult;\n applyDump(dump);\n task.reasoning_content = reasoning_content;\n\n let outputResult = data;\n if (ifTypeRestricted) {\n // If AI returned a plain string instead of structured format, use it directly\n if (typeof data === 'string') {\n outputResult = data;\n } else if (type === 'WaitFor') {\n if (data === null || data === undefined) {\n outputResult = false;\n } else {\n outputResult = (data as any)[keyOfResult];\n }\n } else if (data === null || data === undefined) {\n outputResult = null;\n } else {\n assert(\n data?.[keyOfResult] !== undefined,\n 'No result in query data',\n );\n outputResult = (data as any)[keyOfResult];\n }\n }\n\n if (type === 'Assert' && !outputResult) {\n task.usage = usage;\n task.thought = thought;\n throw new Error(`Assertion failed: ${thought}`);\n }\n\n return {\n output: outputResult,\n log: queryDump,\n usage,\n thought,\n };\n },\n };\n\n return queryTask;\n }\n async createTypeQueryExecution<T>(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ExecutionResult<T>> {\n const session = this.createExecutionSession(\n taskTitleStr(\n type,\n typeof demand === 'string' ? demand : JSON.stringify(demand),\n ),\n );\n\n const queryTask = await this.createTypeQueryTask(\n type,\n demand,\n modelConfig,\n opt,\n multimodalPrompt,\n );\n\n const runner = session.getRunner();\n const result = await session.appendAndRun(queryTask);\n\n if (!result) {\n throw new Error(\n 'result of taskExecutor.flush() is undefined in function createTypeQueryTask',\n );\n }\n\n const { output, thought } = result;\n\n return {\n output,\n thought,\n runner,\n };\n }\n\n private sleepPlan(timeMs: number): PlanningAction<PlanningActionParamSleep> {\n return {\n type: 'Sleep',\n param: {\n timeMs,\n },\n };\n }\n\n async taskForSleep(timeMs: number, _modelConfig: IModelConfig) {\n return this.taskBuilder.createSleepTask({\n timeMs,\n });\n }\n\n async waitFor(\n assertion: TUserPrompt,\n opt: PlanningActionParamWaitFor,\n modelConfig: IModelConfig,\n ): Promise<ExecutionResult<void>> {\n const { textPrompt, multimodalPrompt } = parsePrompt(assertion);\n\n const description = `waitFor: ${textPrompt}`;\n const session = this.createExecutionSession(\n taskTitleStr('WaitFor', description),\n );\n const runner = session.getRunner();\n const {\n timeoutMs,\n checkIntervalMs,\n domIncluded,\n screenshotIncluded,\n ...restOpt\n } = opt;\n const serviceExtractOpt: ServiceExtractOption = {\n domIncluded,\n screenshotIncluded,\n ...restOpt,\n };\n\n assert(assertion, 'No assertion for waitFor');\n assert(timeoutMs, 'No timeoutMs for waitFor');\n assert(checkIntervalMs, 'No checkIntervalMs for waitFor');\n\n assert(\n checkIntervalMs <= timeoutMs,\n `wrong config for waitFor: checkIntervalMs must be less than timeoutMs, config: {checkIntervalMs: ${checkIntervalMs}, timeoutMs: ${timeoutMs}}`,\n );\n\n const overallStartTime = Date.now();\n let lastCheckStart = overallStartTime;\n let errorThought = '';\n // Continue checking as long as the previous iteration began within the timeout window.\n while (lastCheckStart - overallStartTime <= timeoutMs) {\n const currentCheckStart = Date.now();\n lastCheckStart = currentCheckStart;\n const queryTask = await this.createTypeQueryTask(\n 'WaitFor',\n textPrompt,\n modelConfig,\n serviceExtractOpt,\n multimodalPrompt,\n );\n\n const result = (await session.appendAndRun(queryTask)) as\n | {\n output: boolean;\n thought?: string;\n }\n | undefined;\n\n if (result?.output) {\n return {\n output: undefined,\n runner,\n };\n }\n\n errorThought =\n result?.thought ||\n (!result && `No result from assertion: ${textPrompt}`) ||\n `unknown error when waiting for assertion: ${textPrompt}`;\n const now = Date.now();\n if (now - currentCheckStart < checkIntervalMs) {\n const timeRemaining = checkIntervalMs - (now - currentCheckStart);\n const sleepTask = this.taskBuilder.createSleepTask({\n timeMs: timeRemaining,\n });\n await session.append(sleepTask);\n }\n }\n\n return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);\n }\n}\n\nexport async function withFileChooser<T>(\n interfaceInstance: AbstractInterface,\n fileChooserAccept: string[] | undefined,\n action: () => Promise<T>,\n): Promise<T> {\n if (!fileChooserAccept?.length) {\n return action();\n }\n\n if (!interfaceInstance.registerFileChooserListener) {\n throw new Error(\n `File upload is not supported on ${interfaceInstance.interfaceType}`,\n );\n }\n\n const handler = async (chooser: FileChooserHandler) => {\n await chooser.accept(fileChooserAccept);\n };\n\n const { dispose, getError } =\n await interfaceInstance.registerFileChooserListener(handler);\n try {\n const result = await action();\n // Check for errors that occurred during file chooser handling\n const error = getError();\n if (error) {\n throw error;\n }\n return result;\n } finally {\n dispose();\n }\n}\n"],"names":["debug","getDebug","maxErrorCountAllowedInOnePlanningLoop","TaskExecutor","title","options","ExecutionSession","Promise","plans","modelConfigForPlanning","modelConfigForDefaultIntent","userInstruction","yamlString","session","taskTitleStr","task","param","executorContext","uiContext","assert","runner","tasks","result","output","userPrompt","includeBboxInPlanning","aiActContext","cacheable","replanningCycleLimitOverride","imagesIncludeCount","deepThink","fileChooserAccept","withFileChooser","replanCount","yamlFlow","replanningCycleLimit","undefined","errorCountInOnePlanningLoop","startTime","Date","vlMode","uiTarsModelVersion","actionSpace","action","Array","console","planResult","uiTarsPlanning","plan","JSON","actions","log","more_actions_needed_by_instruction","error","usage","rawResponse","sleep","reasoning_content","finalActions","timeNow","timeRemaining","executables","errorFlag","String","Error","errorMsg","type","demand","modelConfig","opt","multimodalPrompt","queryTask","taskContext","queryDump","applyDump","dump","ifTypeRestricted","demandInput","keyOfResult","booleanPrompt","extractResult","extraPageDescription","tree","descriptionOfTree","ServiceError","data","thought","outputResult","timeMs","_modelConfig","assertion","textPrompt","parsePrompt","description","timeoutMs","checkIntervalMs","domIncluded","screenshotIncluded","restOpt","serviceExtractOpt","overallStartTime","lastCheckStart","errorThought","currentCheckStart","now","sleepTask","interfaceInstance","service","opts","ConversationHistory","TaskBuilder","handler","chooser","dispose","getError"],"mappings":";;;;;;;;;;;;;;;;;;;;AAgDA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,wCAAwC;AAIvC,MAAMC;IAoBX,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,SAAS;IACvB;IA6BQ,uBACNC,KAAa,EACbC,OAA0C,EAC1C;QACA,OAAO,IAAIC,iBACTF,OACA,IAAMG,QAAQ,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,kBAAkB,KACrD;YACE,aAAa,IAAI,CAAC,mBAAmB;YACrC,OAAOF,SAAS;YAChB,cAAc,IAAI,CAAC,KAAK,EAAE;QAC5B;IAEJ;IAEQ,iBAAiC;QACvC,OAAO,IAAI,CAAC,mBAAmB;IACjC;IAEA,MAAa,wBACXG,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACzCL,OAGC,EACD;QACA,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,CAC3BG,OACAC,wBACAC,6BACAL;IAEJ;IAEA,MAAM,uBAAuBM,eAAuB,EAAEC,UAAkB,EAAE;QACxE,MAAMC,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUH;QAGzB,MAAMI,OAAmC;YACvC,MAAM;YACN,SAAS;YACT,OAAO;gBACLJ;YACF;YACA,UAAU,OAAOK,OAAOC;gBACtB,MAAM,EAAEC,SAAS,EAAE,GAAGD;gBACtBE,OAAOD,WAAW;gBAClB,OAAO;oBACL,QAAQ;wBACN,SAAS,EAAE;wBACX,oCAAoC;wBACpC,KAAK;wBACLN;oBACF;oBACA,OAAO;wBACL,KAAK;oBACP;oBACA,OAAO;wBACL,MAAM;wBACN,SAAS;4BACPA;wBACF;oBACF;gBACF;YACF;QACF;QACA,MAAMQ,SAASP,QAAQ,SAAS;QAChC,MAAMA,QAAQ,YAAY,CAACE;QAE3B,OAAO;YACLK;QACF;IACF;IAEA,MAAM,SACJhB,KAAa,EACbI,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACf;QAC1B,MAAMG,UAAU,IAAI,CAAC,sBAAsB,CAACT;QAC5C,MAAM,EAAEiB,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAClDb,OACAC,wBACAC;QAEF,MAAMU,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACQ;QAC1C,MAAM,EAAEE,MAAM,EAAE,GAAGD,UAAU,CAAC;QAC9B,OAAO;YACLC;YACAH;QACF;IACF;IAEA,MAAM,OACJI,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAC3BC,SAA2B,EAC3BC,iBAA4B,EAQ5B;QACA,OAAOC,gBAAgB,IAAI,CAAC,SAAS,EAAED,mBAAmB,UACjD,IAAI,CAAC,SAAS,CACnBP,YACAf,wBACAC,6BACAe,uBACAC,cACAC,WACAC,8BACAC,oBACAC;IAGN;IAEA,MAAc,UACZN,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAC3BC,SAA2B,EAQ3B;QACA,IAAI,CAAC,mBAAmB,CAAC,KAAK;QAE9B,MAAMjB,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUU;QAEzB,MAAMJ,SAASP,QAAQ,SAAS;QAEhC,IAAIoB,cAAc;QAClB,MAAMC,WAAmC,EAAE;QAC3C,MAAMC,uBACJP,gCAAgC,IAAI,CAAC,oBAAoB;QAC3DT,OACEgB,AAAyBC,WAAzBD,sBACA;QAGF,IAAIE,8BAA8B;QAGlC,MAAO,KAAM;YACX,MAAMf,SAAS,MAAMT,QAAQ,YAAY,CACvC;gBACE,MAAM;gBACN,SAAS;gBACT,OAAO;oBACL,iBAAiBW;oBACjBE;oBACAG;oBACAC;gBACF;gBACA,UAAU,OAAOd,OAAOC;oBACtB,MAAMqB,YAAYC,KAAK,GAAG;oBAC1B,MAAM,EAAErB,SAAS,EAAE,GAAGD;oBACtBE,OAAOD,WAAW;oBAClB,MAAM,EAAEsB,MAAM,EAAE,GAAG/B;oBACnB,MAAMgC,qBACJD,AAAW,kBAAXA,SACI/B,uBAAuB,kBAAkB,GACzC2B;oBAEN,MAAMM,cAAc,IAAI,CAAC,cAAc;oBACvC1C,MACE,sCACA0C,YAAY,GAAG,CAAC,CAACC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;oBAEhDxB,OAAOyB,MAAM,OAAO,CAACF,cAAc;oBACnC,IAAIA,AAAuB,MAAvBA,YAAY,MAAM,EACpBG,QAAQ,IAAI,CACV,CAAC,gBAAgB,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,gDAAgD,CAAC;oBAIrG,MAAMC,aAAa,MAAOL,AAAAA,CAAAA,qBACtBM,iBACAC,IAAG,EAAGhC,MAAM,eAAe,EAAE;wBAC/B,SAASE;wBACT,eAAeF,MAAM,YAAY;wBACjC,eAAe,IAAI,CAAC,SAAS,CAAC,aAAa;wBAC3C0B;wBACA,aAAajC;wBACb,qBAAqB,IAAI,CAAC,mBAAmB;wBAC7C,aAAagB;wBACbI;wBACAC;oBACF;oBACA9B,MAAM,cAAciD,KAAK,SAAS,CAACH,YAAY,MAAM;oBAErD,MAAM,EACJI,OAAO,EACPC,GAAG,EACHC,kCAAkC,EAClCC,KAAK,EACLC,KAAK,EACLC,WAAW,EACXC,KAAK,EACLC,iBAAiB,EAClB,GAAGX;oBAEJ7B,gBAAgB,IAAI,CAAC,GAAG,GAAG;wBACzB,GAAIA,gBAAgB,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;wBAClCsC;oBACF;oBACAtC,gBAAgB,IAAI,CAAC,KAAK,GAAGqC;oBAC7BrC,gBAAgB,IAAI,CAAC,iBAAiB,GAAGwC;oBACzCxC,gBAAgB,IAAI,CAAC,MAAM,GAAG;wBAC5B,SAASiC,WAAW,EAAE;wBACtBE;wBACAD;wBACA,UAAUL,WAAW,QAAQ;oBAC/B;oBACA7B,gBAAgB,SAAS,GAAGC;oBAE5B,MAAMwC,eAAe;2BAAKR,WAAW,EAAE;qBAAE;oBAEzC,IAAIM,OAAO;wBACT,MAAMG,UAAUpB,KAAK,GAAG;wBACxB,MAAMqB,gBAAgBJ,QAASG,CAAAA,UAAUrB,SAAQ;wBACjD,IAAIsB,gBAAgB,GAClBF,aAAa,IAAI,CAAC,IAAI,CAAC,SAAS,CAACE;oBAErC;oBAEAzC,OAAO,CAACkC,OAAO,CAAC,oBAAoB,EAAEA,MAAM,EAAE,EAAEF,OAAO,IAAI;oBAE3D,OAAO;wBACL,OAAO;4BACL,KAAK;wBACP;oBACF;gBACF;YACF,GACA;gBACE,gBAAgB;YAClB;YAGF,MAAML,aAAaxB,QAAQ;YAG3B,MAAMd,QAAQsC,YAAY,WAAW,EAAE;YACvCZ,SAAS,IAAI,IAAKY,YAAY,YAAY,EAAE;YAE5C,IAAIe;YACJ,IAAI;gBACFA,cAAc,MAAM,IAAI,CAAC,uBAAuB,CAC9CrD,OACAC,wBACAC,6BACA;oBACEiB;oBACA,SAAS;gBACX;YAEJ,EAAE,OAAO0B,OAAO;gBACd,OAAOxC,QAAQ,eAAe,CAC5B,CAAC,4CAA4C,EAAEwC,MAAM,SAAS,EAAEJ,KAAK,SAAS,CAC5EzC,QACC;YAEP;YACA,IAAI,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EACjDqC,QAAQ,IAAI,CACV,8FACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB;YAGnD,IAAIiB,YAAY;YAChB,IAAI;gBACF,MAAMjD,QAAQ,YAAY,CAACgD,YAAY,KAAK;YAC9C,EAAE,OAAOR,OAAY;gBACnBS,YAAY;gBACZzB;gBACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAAG,CAAC,+BAA+B,EAAEgB,OAAO,WAAWU,OAAOV,QAAQ;gBACrHrD,MACE,yFACAqD,iBAAiBW,QAAQX,MAAM,OAAO,GAAGU,OAAOV,QAChD,6CACAhB;YAEJ;YAEA,IAAIA,8BAA8BnC,uCAChC,OAAOW,QAAQ,eAAe,CAAC;YAIjC,IAAI,CAACiC,YAAY,oCACf,IAAIgB,WACF9D,MACE;iBAGF;YAKJ,EAAEiC;YAEF,IAAIA,cAAcE,sBAAsB;gBACtC,MAAM8B,WAAW,CAAC,UAAU,EAAE9B,qBAAqB,4JAA4J,CAAC;gBAChN,OAAOtB,QAAQ,eAAe,CAACoD;YACjC;YAEA,IAAI,CAAC,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EAClD,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAC7C;QAEN;QAEA,OAAO;YACL,QAAQ;gBACN/B;YACF;YACAd;QACF;IACF;IAEQ,oBACN8C,IAAsE,EACtEC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACpC;QACA,MAAMC,YAA4C;YAChD,MAAM;YACN,SAASL;YACT,OAAO;gBACL,YAAYI,mBACP;oBACCH;oBACAG;gBACF,IACAH;YACN;YACA,UAAU,OAAOnD,OAAOwD;gBACtB,MAAM,EAAEzD,IAAI,EAAE,GAAGyD;gBACjB,IAAIC;gBACJ,MAAMC,YAAY,CAACC;oBACjBF,YAAYE;oBACZ5D,KAAK,GAAG,GAAG;wBACT4D;oBACF;gBACF;gBAGA,MAAMzD,YAAYsD,YAAY,SAAS;gBACvCrD,OAAOD,WAAW;gBAElB,MAAM0D,mBAAmBV,AAAS,YAATA;gBACzB,IAAIW,cAAcV;gBAClB,IAAIW,cAAc;gBAClB,IAAIF,oBAAqBV,CAAAA,AAAS,aAATA,QAAqBA,AAAS,cAATA,IAAiB,GAAI;oBACjEY,cAAc;oBACd,MAAMC,gBACJb,AAAS,aAATA,OACI,CAAC,kDAAkD,EAAEC,QAAQ,GAC7D,CAAC,+GAA+G,EAAEA,QAAQ;oBAChIU,cAAc;wBACZ,CAACC,YAAY,EAAEC;oBACjB;gBACF,OAAO,IAAIH,kBACTC,cAAc;oBACZ,CAACC,YAAY,EAAE,GAAGZ,KAAK,EAAE,EAAEC,QAAQ;gBACrC;gBAGF,IAAIa;gBAEJ,IAAIC,uBAAuB;gBAC3B,IAAIZ,KAAK,eAAe,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE;oBAC1DrE,MAAM;oBACN,MAAMkF,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,mBAAmB;oBACrDD,uBAAuB,MAAME,kBAC3BD,MACA,KACA,OACAb,KAAK,gBAAgB;gBAEzB;gBAEA,IAAI;oBACFW,gBAAgB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CACxCH,aACAT,aACAC,KACAY,sBACAX;gBAEJ,EAAE,OAAOjB,OAAO;oBACd,IAAIA,iBAAiB+B,cACnBV,UAAUrB,MAAM,IAAI;oBAEtB,MAAMA;gBACR;gBAEA,MAAM,EAAEgC,IAAI,EAAE/B,KAAK,EAAEgC,OAAO,EAAEX,IAAI,EAAElB,iBAAiB,EAAE,GAAGuB;gBAC1DN,UAAUC;gBACV5D,KAAK,iBAAiB,GAAG0C;gBAEzB,IAAI8B,eAAeF;gBACnB,IAAIT,kBAEF,IAAI,AAAgB,YAAhB,OAAOS,MACTE,eAAeF;qBACV,IAAInB,AAAS,cAATA,MAEPqB,eADEF,QAAAA,OACa,QAECA,IAAY,CAACP,YAAY;qBAEtC,IAAIO,QAAAA,MACTE,eAAe;qBACV;oBACLpE,OACEkE,MAAM,CAACP,YAAY,KAAK1C,QACxB;oBAEFmD,eAAgBF,IAAY,CAACP,YAAY;gBAC3C;gBAGF,IAAIZ,AAAS,aAATA,QAAqB,CAACqB,cAAc;oBACtCxE,KAAK,KAAK,GAAGuC;oBACbvC,KAAK,OAAO,GAAGuE;oBACf,MAAM,IAAItB,MAAM,CAAC,kBAAkB,EAAEsB,SAAS;gBAChD;gBAEA,OAAO;oBACL,QAAQC;oBACR,KAAKd;oBACLnB;oBACAgC;gBACF;YACF;QACF;QAEA,OAAOf;IACT;IACA,MAAM,yBACJL,IAA0D,EAC1DC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACP;QAC7B,MAAMzD,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aACEoD,MACA,AAAkB,YAAlB,OAAOC,SAAsBA,SAASlB,KAAK,SAAS,CAACkB;QAIzD,MAAMI,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9CL,MACAC,QACAC,aACAC,KACAC;QAGF,MAAMlD,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAAC0D;QAE1C,IAAI,CAACjD,QACH,MAAM,IAAI0C,MACR;QAIJ,MAAM,EAAEzC,MAAM,EAAE+D,OAAO,EAAE,GAAGhE;QAE5B,OAAO;YACLC;YACA+D;YACAlE;QACF;IACF;IAEQ,UAAUoE,MAAc,EAA4C;QAC1E,OAAO;YACL,MAAM;YACN,OAAO;gBACLA;YACF;QACF;IACF;IAEA,MAAM,aAAaA,MAAc,EAAEC,YAA0B,EAAE;QAC7D,OAAO,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;YACtCD;QACF;IACF;IAEA,MAAM,QACJE,SAAsB,EACtBrB,GAA+B,EAC/BD,WAAyB,EACO;QAChC,MAAM,EAAEuB,UAAU,EAAErB,gBAAgB,EAAE,GAAGsB,YAAYF;QAErD,MAAMG,cAAc,CAAC,SAAS,EAAEF,YAAY;QAC5C,MAAM9E,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,WAAW+E;QAE1B,MAAMzE,SAASP,QAAQ,SAAS;QAChC,MAAM,EACJiF,SAAS,EACTC,eAAe,EACfC,WAAW,EACXC,kBAAkB,EAClB,GAAGC,SACJ,GAAG7B;QACJ,MAAM8B,oBAA0C;YAC9CH;YACAC;YACA,GAAGC,OAAO;QACZ;QAEA/E,OAAOuE,WAAW;QAClBvE,OAAO2E,WAAW;QAClB3E,OAAO4E,iBAAiB;QAExB5E,OACE4E,mBAAmBD,WACnB,CAAC,iGAAiG,EAAEC,gBAAgB,aAAa,EAAED,UAAU,CAAC,CAAC;QAGjJ,MAAMM,mBAAmB7D,KAAK,GAAG;QACjC,IAAI8D,iBAAiBD;QACrB,IAAIE,eAAe;QAEnB,MAAOD,iBAAiBD,oBAAoBN,UAAW;YACrD,MAAMS,oBAAoBhE,KAAK,GAAG;YAClC8D,iBAAiBE;YACjB,MAAMhC,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9C,WACAoB,YACAvB,aACA+B,mBACA7B;YAGF,MAAMhD,SAAU,MAAMT,QAAQ,YAAY,CAAC0D;YAO3C,IAAIjD,QAAQ,QACV,OAAO;gBACL,QAAQc;gBACRhB;YACF;YAGFkF,eACEhF,QAAQ,WACP,CAACA,UAAU,CAAC,0BAA0B,EAAEqE,YAAY,IACrD,CAAC,0CAA0C,EAAEA,YAAY;YAC3D,MAAMa,MAAMjE,KAAK,GAAG;YACpB,IAAIiE,MAAMD,oBAAoBR,iBAAiB;gBAC7C,MAAMnC,gBAAgBmC,kBAAmBS,CAAAA,MAAMD,iBAAgB;gBAC/D,MAAME,YAAY,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;oBACjD,QAAQ7C;gBACV;gBACA,MAAM/C,QAAQ,MAAM,CAAC4F;YACvB;QACF;QAEA,OAAO5F,QAAQ,eAAe,CAAC,CAAC,iBAAiB,EAAEyF,cAAc;IACnE;IAtnBA,YACEI,iBAAoC,EACpCC,OAAgB,EAChBC,IAMC,CACD;QAjCF;QAEA;QAEA;QAEA,uBAAiB,uBAAjB;QAEA,uBAAiB,eAAjB;QAEA,uBAAQ,uBAAR;QAEA;QAEA,uBAAiB,SAAjB;QAEA;QAkBE,IAAI,CAAC,SAAS,GAAGF;QACjB,IAAI,CAAC,OAAO,GAAGC;QACf,IAAI,CAAC,SAAS,GAAGC,KAAK,SAAS;QAC/B,IAAI,CAAC,mBAAmB,GAAGA,MAAM;QACjC,IAAI,CAAC,oBAAoB,GAAGA,KAAK,oBAAoB;QACrD,IAAI,CAAC,KAAK,GAAGA,KAAK,KAAK;QACvB,IAAI,CAAC,mBAAmB,GAAG,IAAIC;QAC/B,IAAI,CAAC,mBAAmB,GAAGD,KAAK,WAAW;QAC3C,IAAI,CAAC,WAAW,GAAG,IAAIE,YAAY;YACjCJ;YACAC;YACA,WAAWC,KAAK,SAAS;YACzB,aAAa,IAAI,CAAC,cAAc;QAClC;IACF;AA8lBF;AAEO,eAAe5E,gBACpB0E,iBAAoC,EACpC3E,iBAAuC,EACvCY,MAAwB;IAExB,IAAI,CAACZ,mBAAmB,QACtB,OAAOY;IAGT,IAAI,CAAC+D,kBAAkB,2BAA2B,EAChD,MAAM,IAAI1C,MACR,CAAC,gCAAgC,EAAE0C,kBAAkB,aAAa,EAAE;IAIxE,MAAMK,UAAU,OAAOC;QACrB,MAAMA,QAAQ,MAAM,CAACjF;IACvB;IAEA,MAAM,EAAEkF,OAAO,EAAEC,QAAQ,EAAE,GACzB,MAAMR,kBAAkB,2BAA2B,CAACK;IACtD,IAAI;QACF,MAAMzF,SAAS,MAAMqB;QAErB,MAAMU,QAAQ6D;QACd,IAAI7D,OACF,MAAMA;QAER,OAAO/B;IACT,SAAU;QACR2F;IACF;AACF"}
1
+ {"version":3,"file":"agent/tasks.mjs","sources":["../../../src/agent/tasks.ts"],"sourcesContent":["import { ConversationHistory, plan, uiTarsPlanning } from '@/ai-model';\nimport {\n type TMultimodalPrompt,\n type TUserPrompt,\n getReadableTimeString,\n} from '@/common';\nimport type { AbstractInterface, FileChooserHandler } from '@/device';\nimport type Service from '@/service';\nimport type { TaskRunner } from '@/task-runner';\nimport { TaskExecutionError } from '@/task-runner';\nimport type {\n DeepThinkOption,\n DeviceAction,\n ExecutionTaskApply,\n ExecutionTaskInsightQueryApply,\n ExecutionTaskPlanningApply,\n ExecutionTaskProgressOptions,\n InterfaceType,\n MidsceneYamlFlowItem,\n PlanningAIResponse,\n PlanningAction,\n PlanningActionParamWaitFor,\n ServiceDump,\n ServiceExtractOption,\n ServiceExtractParam,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ExecutionSession } from './execution-session';\nimport { TaskBuilder } from './task-builder';\nimport type { TaskCache } from './task-cache';\nexport { locatePlanForLocate } from './task-builder';\nimport { descriptionOfTree } from '@midscene/shared/extractor';\nimport { taskTitleStr } from './ui-utils';\nimport { parsePrompt } from './utils';\n\ninterface ExecutionResult<OutputType = any> {\n output: OutputType;\n thought?: string;\n runner: TaskRunner;\n}\n\ninterface TaskExecutorHooks {\n onTaskUpdate?: (\n runner: TaskRunner,\n error?: TaskExecutionError,\n ) => Promise<void> | void;\n}\n\nconst debug = getDebug('device-task-executor');\nconst maxErrorCountAllowedInOnePlanningLoop = 5;\n\nexport { TaskExecutionError };\n\nexport class TaskExecutor {\n interface: AbstractInterface;\n\n service: Service;\n\n taskCache?: TaskCache;\n\n private readonly providedActionSpace: DeviceAction[];\n\n private readonly taskBuilder: TaskBuilder;\n\n private conversationHistory: ConversationHistory;\n\n onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];\n\n private readonly hooks?: TaskExecutorHooks;\n\n replanningCycleLimit?: number;\n\n // @deprecated use .interface instead\n get page() {\n return this.interface;\n }\n\n constructor(\n interfaceInstance: AbstractInterface,\n service: Service,\n opts: {\n taskCache?: TaskCache;\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n replanningCycleLimit?: number;\n hooks?: TaskExecutorHooks;\n actionSpace: DeviceAction[];\n },\n ) {\n this.interface = interfaceInstance;\n this.service = service;\n this.taskCache = opts.taskCache;\n this.onTaskStartCallback = opts?.onTaskStart;\n this.replanningCycleLimit = opts.replanningCycleLimit;\n this.hooks = opts.hooks;\n this.conversationHistory = new ConversationHistory();\n this.providedActionSpace = opts.actionSpace;\n this.taskBuilder = new TaskBuilder({\n interfaceInstance,\n service,\n taskCache: opts.taskCache,\n actionSpace: this.getActionSpace(),\n });\n }\n\n private createExecutionSession(\n title: string,\n options?: { tasks?: ExecutionTaskApply[] },\n ) {\n return new ExecutionSession(\n title,\n () => Promise.resolve(this.service.contextRetrieverFn()),\n {\n onTaskStart: this.onTaskStartCallback,\n tasks: options?.tasks,\n onTaskUpdate: this.hooks?.onTaskUpdate,\n },\n );\n }\n\n private getActionSpace(): DeviceAction[] {\n return this.providedActionSpace;\n }\n\n public async convertPlanToExecutable(\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n options?: {\n cacheable?: boolean;\n subTask?: boolean;\n },\n ) {\n return this.taskBuilder.build(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n options,\n );\n }\n\n async loadYamlFlowAsPlanning(userInstruction: string, yamlString: string) {\n const session = this.createExecutionSession(\n taskTitleStr('Action', userInstruction),\n );\n\n const task: ExecutionTaskPlanningApply = {\n type: 'Planning',\n subType: 'LoadYaml',\n param: {\n userInstruction,\n },\n executor: async (param, executorContext) => {\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n return {\n output: {\n actions: [],\n shouldContinuePlanning: false,\n log: '',\n yamlString,\n },\n cache: {\n hit: true,\n },\n hitBy: {\n from: 'Cache',\n context: {\n yamlString,\n },\n },\n };\n },\n };\n const runner = session.getRunner();\n await session.appendAndRun(task);\n\n return {\n runner,\n };\n }\n\n async runPlans(\n title: string,\n plans: PlanningAction[],\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n ): Promise<ExecutionResult> {\n const session = this.createExecutionSession(title);\n const { tasks } = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n );\n const runner = session.getRunner();\n const result = await session.appendAndRun(tasks);\n const { output } = result ?? {};\n return {\n output,\n runner,\n };\n }\n\n async action(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n deepThink?: DeepThinkOption,\n fileChooserAccept?: string[],\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n output?: string;\n }\n | undefined\n >\n > {\n return withFileChooser(this.interface, fileChooserAccept, async () => {\n return this.runAction(\n userPrompt,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n includeBboxInPlanning,\n aiActContext,\n cacheable,\n replanningCycleLimitOverride,\n imagesIncludeCount,\n deepThink,\n );\n });\n }\n\n private async runAction(\n userPrompt: string,\n modelConfigForPlanning: IModelConfig,\n modelConfigForDefaultIntent: IModelConfig,\n includeBboxInPlanning: boolean,\n aiActContext?: string,\n cacheable?: boolean,\n replanningCycleLimitOverride?: number,\n imagesIncludeCount?: number,\n deepThink?: DeepThinkOption,\n ): Promise<\n ExecutionResult<\n | {\n yamlFlow?: MidsceneYamlFlowItem[]; // for cache use\n output?: string;\n }\n | undefined\n >\n > {\n this.conversationHistory.reset();\n\n const session = this.createExecutionSession(\n taskTitleStr('Action', userPrompt),\n );\n const runner = session.getRunner();\n\n let replanCount = 0;\n const yamlFlow: MidsceneYamlFlowItem[] = [];\n const replanningCycleLimit =\n replanningCycleLimitOverride ?? this.replanningCycleLimit;\n assert(\n replanningCycleLimit !== undefined,\n 'replanningCycleLimit is required for TaskExecutor.action',\n );\n\n let errorCountInOnePlanningLoop = 0; // count the number of errors in one planning loop\n let outputString: string | undefined;\n\n // Main planning loop - unified plan/replan logic\n while (true) {\n const result = await session.appendAndRun(\n {\n type: 'Planning',\n subType: 'Plan',\n param: {\n userInstruction: userPrompt,\n aiActContext,\n imagesIncludeCount,\n deepThink,\n },\n executor: async (param, executorContext) => {\n const { uiContext } = executorContext;\n assert(uiContext, 'uiContext is required for Planning task');\n const { vlMode } = modelConfigForPlanning;\n const uiTarsModelVersion =\n vlMode === 'vlm-ui-tars'\n ? modelConfigForPlanning.uiTarsModelVersion\n : undefined;\n\n const actionSpace = this.getActionSpace();\n debug(\n 'actionSpace for this interface is:',\n actionSpace.map((action) => action.name).join(', '),\n );\n assert(Array.isArray(actionSpace), 'actionSpace must be an array');\n if (actionSpace.length === 0) {\n console.warn(\n `ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`,\n );\n }\n\n const planResult = await (uiTarsModelVersion\n ? uiTarsPlanning\n : plan)(param.userInstruction, {\n context: uiContext,\n actionContext: param.aiActContext,\n interfaceType: this.interface.interfaceType as InterfaceType,\n actionSpace,\n modelConfig: modelConfigForPlanning,\n conversationHistory: this.conversationHistory,\n includeBbox: includeBboxInPlanning,\n imagesIncludeCount,\n deepThink,\n });\n debug('planResult', JSON.stringify(planResult, null, 2));\n\n const {\n actions,\n log,\n note,\n error,\n usage,\n rawResponse,\n reasoning_content,\n } = planResult;\n\n executorContext.task.log = {\n ...(executorContext.task.log || {}),\n rawResponse,\n };\n executorContext.task.usage = usage;\n executorContext.task.reasoning_content = reasoning_content;\n executorContext.task.output = {\n actions: actions || [],\n log,\n note,\n yamlFlow: planResult.yamlFlow,\n output: outputString,\n shouldContinuePlanning: planResult.shouldContinuePlanning,\n };\n executorContext.uiContext = uiContext;\n\n assert(!error, `Failed to continue: ${error}\\n${log || ''}`);\n\n return {\n cache: {\n hit: false,\n },\n } as any;\n },\n },\n {\n allowWhenError: true,\n },\n );\n\n const planResult = result?.output as PlanningAIResponse | undefined;\n\n // Execute planned actions\n const plans = planResult?.actions || [];\n yamlFlow.push(...(planResult?.yamlFlow || []));\n\n let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;\n try {\n executables = await this.convertPlanToExecutable(\n plans,\n modelConfigForPlanning,\n modelConfigForDefaultIntent,\n {\n cacheable,\n subTask: true,\n },\n );\n } catch (error) {\n return session.appendErrorPlan(\n `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(\n plans,\n )}`,\n );\n }\n if (this.conversationHistory.pendingFeedbackMessage) {\n console.warn(\n 'unconsumed pending feedback message detected, this may lead to unexpected planning result:',\n this.conversationHistory.pendingFeedbackMessage,\n );\n }\n // todo: set time string\n\n try {\n const result = await session.appendAndRun(executables.tasks);\n outputString = result?.output;\n } catch (error: any) {\n // errorFlag = true;\n errorCountInOnePlanningLoop++;\n this.conversationHistory.pendingFeedbackMessage = `Time: ${getReadableTimeString()}, Error executing running tasks: ${error?.message || String(error)}`;\n debug(\n 'error when executing running tasks, but continue to run if it is not too many errors:',\n error instanceof Error ? error.message : String(error),\n 'current error count in one planning loop:',\n errorCountInOnePlanningLoop,\n );\n }\n\n if (errorCountInOnePlanningLoop > maxErrorCountAllowedInOnePlanningLoop) {\n return session.appendErrorPlan('Too many errors in one planning loop');\n }\n\n // // Check if task is complete\n if (!planResult?.shouldContinuePlanning) {\n break;\n }\n\n // Increment replan count for next iteration\n ++replanCount;\n\n if (replanCount > replanningCycleLimit) {\n const errorMsg = `Replanned ${replanningCycleLimit} times, exceeding the limit. Please configure a larger value for replanningCycleLimit (or use MIDSCENE_REPLANNING_CYCLE_LIMIT) to handle more complex tasks.`;\n return session.appendErrorPlan(errorMsg);\n }\n\n if (!this.conversationHistory.pendingFeedbackMessage) {\n this.conversationHistory.pendingFeedbackMessage = `Time: ${getReadableTimeString()}, I have finished the action previously planned.`;\n }\n }\n\n return {\n output: {\n yamlFlow,\n output: outputString,\n },\n runner,\n };\n }\n\n private createTypeQueryTask(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert' | 'WaitFor',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ) {\n const queryTask: ExecutionTaskInsightQueryApply = {\n type: 'Insight',\n subType: type,\n param: {\n dataDemand: multimodalPrompt\n ? ({\n demand,\n multimodalPrompt,\n } as never)\n : demand, // for user param presentation in report right sidebar\n },\n executor: async (param, taskContext) => {\n const { task } = taskContext;\n let queryDump: ServiceDump | undefined;\n const applyDump = (dump: ServiceDump) => {\n queryDump = dump;\n task.log = {\n dump,\n };\n };\n\n // Get context for query operations\n const uiContext = taskContext.uiContext;\n assert(uiContext, 'uiContext is required for Query task');\n\n const ifTypeRestricted = type !== 'Query';\n let demandInput = demand;\n let keyOfResult = 'result';\n if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {\n keyOfResult = 'StatementIsTruthy';\n const booleanPrompt =\n type === 'Assert'\n ? `Boolean, whether the following statement is true: ${demand}`\n : `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;\n demandInput = {\n [keyOfResult]: booleanPrompt,\n };\n } else if (ifTypeRestricted) {\n demandInput = {\n [keyOfResult]: `${type}, ${demand}`,\n };\n }\n\n let extractResult;\n\n let extraPageDescription = '';\n if (opt?.domIncluded && this.interface.getElementsNodeTree) {\n debug('appending tree info for page');\n const tree = await this.interface.getElementsNodeTree();\n extraPageDescription = await descriptionOfTree(\n tree,\n 200,\n false,\n opt?.domIncluded === 'visible-only',\n );\n }\n\n try {\n extractResult = await this.service.extract<any>(\n demandInput,\n modelConfig,\n opt,\n extraPageDescription,\n multimodalPrompt,\n );\n } catch (error) {\n if (error instanceof ServiceError) {\n applyDump(error.dump);\n }\n throw error;\n }\n\n const { data, usage, thought, dump, reasoning_content } = extractResult;\n applyDump(dump);\n task.reasoning_content = reasoning_content;\n\n let outputResult = data;\n if (ifTypeRestricted) {\n // If AI returned a plain string instead of structured format, use it directly\n if (typeof data === 'string') {\n outputResult = data;\n } else if (type === 'WaitFor') {\n if (data === null || data === undefined) {\n outputResult = false;\n } else {\n outputResult = (data as any)[keyOfResult];\n }\n } else if (data === null || data === undefined) {\n outputResult = null;\n } else {\n assert(\n data?.[keyOfResult] !== undefined,\n 'No result in query data',\n );\n outputResult = (data as any)[keyOfResult];\n }\n }\n\n if (type === 'Assert' && !outputResult) {\n task.usage = usage;\n task.thought = thought;\n throw new Error(`Assertion failed: ${thought}`);\n }\n\n return {\n output: outputResult,\n log: queryDump,\n usage,\n thought,\n };\n },\n };\n\n return queryTask;\n }\n async createTypeQueryExecution<T>(\n type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert',\n demand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ExecutionResult<T>> {\n const session = this.createExecutionSession(\n taskTitleStr(\n type,\n typeof demand === 'string' ? demand : JSON.stringify(demand),\n ),\n );\n\n const queryTask = await this.createTypeQueryTask(\n type,\n demand,\n modelConfig,\n opt,\n multimodalPrompt,\n );\n\n const runner = session.getRunner();\n const result = await session.appendAndRun(queryTask);\n\n if (!result) {\n throw new Error(\n 'result of taskExecutor.flush() is undefined in function createTypeQueryTask',\n );\n }\n\n const { output, thought } = result;\n\n return {\n output,\n thought,\n runner,\n };\n }\n\n async taskForSleep(timeMs: number, _modelConfig: IModelConfig) {\n return this.taskBuilder.createSleepTask({\n timeMs,\n });\n }\n\n async waitFor(\n assertion: TUserPrompt,\n opt: PlanningActionParamWaitFor,\n modelConfig: IModelConfig,\n ): Promise<ExecutionResult<void>> {\n const { textPrompt, multimodalPrompt } = parsePrompt(assertion);\n\n const description = `waitFor: ${textPrompt}`;\n const session = this.createExecutionSession(\n taskTitleStr('WaitFor', description),\n );\n const runner = session.getRunner();\n const {\n timeoutMs,\n checkIntervalMs,\n domIncluded,\n screenshotIncluded,\n ...restOpt\n } = opt;\n const serviceExtractOpt: ServiceExtractOption = {\n domIncluded,\n screenshotIncluded,\n ...restOpt,\n };\n\n assert(assertion, 'No assertion for waitFor');\n assert(timeoutMs, 'No timeoutMs for waitFor');\n assert(checkIntervalMs, 'No checkIntervalMs for waitFor');\n\n assert(\n checkIntervalMs <= timeoutMs,\n `wrong config for waitFor: checkIntervalMs must be less than timeoutMs, config: {checkIntervalMs: ${checkIntervalMs}, timeoutMs: ${timeoutMs}}`,\n );\n\n const overallStartTime = Date.now();\n let lastCheckStart = overallStartTime;\n let errorThought = '';\n // Continue checking as long as the previous iteration began within the timeout window.\n while (lastCheckStart - overallStartTime <= timeoutMs) {\n const currentCheckStart = Date.now();\n lastCheckStart = currentCheckStart;\n const queryTask = await this.createTypeQueryTask(\n 'WaitFor',\n textPrompt,\n modelConfig,\n serviceExtractOpt,\n multimodalPrompt,\n );\n\n const result = (await session.appendAndRun(queryTask)) as\n | {\n output: boolean;\n thought?: string;\n }\n | undefined;\n\n if (result?.output) {\n return {\n output: undefined,\n runner,\n };\n }\n\n errorThought =\n result?.thought ||\n (!result && `No result from assertion: ${textPrompt}`) ||\n `unknown error when waiting for assertion: ${textPrompt}`;\n const now = Date.now();\n if (now - currentCheckStart < checkIntervalMs) {\n const timeRemaining = checkIntervalMs - (now - currentCheckStart);\n const sleepTask = this.taskBuilder.createSleepTask({\n timeMs: timeRemaining,\n });\n await session.append(sleepTask);\n }\n }\n\n return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);\n }\n}\n\nexport async function withFileChooser<T>(\n interfaceInstance: AbstractInterface,\n fileChooserAccept: string[] | undefined,\n action: () => Promise<T>,\n): Promise<T> {\n if (!fileChooserAccept?.length) {\n return action();\n }\n\n if (!interfaceInstance.registerFileChooserListener) {\n throw new Error(\n `File upload is not supported on ${interfaceInstance.interfaceType}`,\n );\n }\n\n const handler = async (chooser: FileChooserHandler) => {\n await chooser.accept(fileChooserAccept);\n };\n\n const { dispose, getError } =\n await interfaceInstance.registerFileChooserListener(handler);\n try {\n const result = await action();\n // Check for errors that occurred during file chooser handling\n const error = getError();\n if (error) {\n throw error;\n }\n return result;\n } finally {\n dispose();\n }\n}\n"],"names":["debug","getDebug","maxErrorCountAllowedInOnePlanningLoop","TaskExecutor","title","options","ExecutionSession","Promise","plans","modelConfigForPlanning","modelConfigForDefaultIntent","userInstruction","yamlString","session","taskTitleStr","task","param","executorContext","uiContext","assert","runner","tasks","result","output","userPrompt","includeBboxInPlanning","aiActContext","cacheable","replanningCycleLimitOverride","imagesIncludeCount","deepThink","fileChooserAccept","withFileChooser","replanCount","yamlFlow","replanningCycleLimit","undefined","errorCountInOnePlanningLoop","outputString","vlMode","uiTarsModelVersion","actionSpace","action","Array","console","planResult","uiTarsPlanning","plan","JSON","actions","log","note","error","usage","rawResponse","reasoning_content","executables","getReadableTimeString","String","Error","errorMsg","type","demand","modelConfig","opt","multimodalPrompt","queryTask","taskContext","queryDump","applyDump","dump","ifTypeRestricted","demandInput","keyOfResult","booleanPrompt","extractResult","extraPageDescription","tree","descriptionOfTree","ServiceError","data","thought","outputResult","timeMs","_modelConfig","assertion","textPrompt","parsePrompt","description","timeoutMs","checkIntervalMs","domIncluded","screenshotIncluded","restOpt","serviceExtractOpt","overallStartTime","Date","lastCheckStart","errorThought","currentCheckStart","now","timeRemaining","sleepTask","interfaceInstance","service","opts","ConversationHistory","TaskBuilder","handler","chooser","dispose","getError"],"mappings":";;;;;;;;;;;;;;;;;;;;;AAmDA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,wCAAwC;AAIvC,MAAMC;IAoBX,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,SAAS;IACvB;IA6BQ,uBACNC,KAAa,EACbC,OAA0C,EAC1C;QACA,OAAO,IAAIC,iBACTF,OACA,IAAMG,QAAQ,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,kBAAkB,KACrD;YACE,aAAa,IAAI,CAAC,mBAAmB;YACrC,OAAOF,SAAS;YAChB,cAAc,IAAI,CAAC,KAAK,EAAE;QAC5B;IAEJ;IAEQ,iBAAiC;QACvC,OAAO,IAAI,CAAC,mBAAmB;IACjC;IAEA,MAAa,wBACXG,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACzCL,OAGC,EACD;QACA,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,CAC3BG,OACAC,wBACAC,6BACAL;IAEJ;IAEA,MAAM,uBAAuBM,eAAuB,EAAEC,UAAkB,EAAE;QACxE,MAAMC,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUH;QAGzB,MAAMI,OAAmC;YACvC,MAAM;YACN,SAAS;YACT,OAAO;gBACLJ;YACF;YACA,UAAU,OAAOK,OAAOC;gBACtB,MAAM,EAAEC,SAAS,EAAE,GAAGD;gBACtBE,OAAOD,WAAW;gBAClB,OAAO;oBACL,QAAQ;wBACN,SAAS,EAAE;wBACX,wBAAwB;wBACxB,KAAK;wBACLN;oBACF;oBACA,OAAO;wBACL,KAAK;oBACP;oBACA,OAAO;wBACL,MAAM;wBACN,SAAS;4BACPA;wBACF;oBACF;gBACF;YACF;QACF;QACA,MAAMQ,SAASP,QAAQ,SAAS;QAChC,MAAMA,QAAQ,YAAY,CAACE;QAE3B,OAAO;YACLK;QACF;IACF;IAEA,MAAM,SACJhB,KAAa,EACbI,KAAuB,EACvBC,sBAAoC,EACpCC,2BAAyC,EACf;QAC1B,MAAMG,UAAU,IAAI,CAAC,sBAAsB,CAACT;QAC5C,MAAM,EAAEiB,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAClDb,OACAC,wBACAC;QAEF,MAAMU,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACQ;QAC1C,MAAM,EAAEE,MAAM,EAAE,GAAGD,UAAU,CAAC;QAC9B,OAAO;YACLC;YACAH;QACF;IACF;IAEA,MAAM,OACJI,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAC3BC,SAA2B,EAC3BC,iBAA4B,EAS5B;QACA,OAAOC,gBAAgB,IAAI,CAAC,SAAS,EAAED,mBAAmB,UACjD,IAAI,CAAC,SAAS,CACnBP,YACAf,wBACAC,6BACAe,uBACAC,cACAC,WACAC,8BACAC,oBACAC;IAGN;IAEA,MAAc,UACZN,UAAkB,EAClBf,sBAAoC,EACpCC,2BAAyC,EACzCe,qBAA8B,EAC9BC,YAAqB,EACrBC,SAAmB,EACnBC,4BAAqC,EACrCC,kBAA2B,EAC3BC,SAA2B,EAS3B;QACA,IAAI,CAAC,mBAAmB,CAAC,KAAK;QAE9B,MAAMjB,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,UAAUU;QAEzB,MAAMJ,SAASP,QAAQ,SAAS;QAEhC,IAAIoB,cAAc;QAClB,MAAMC,WAAmC,EAAE;QAC3C,MAAMC,uBACJP,gCAAgC,IAAI,CAAC,oBAAoB;QAC3DT,OACEgB,AAAyBC,WAAzBD,sBACA;QAGF,IAAIE,8BAA8B;QAClC,IAAIC;QAGJ,MAAO,KAAM;YACX,MAAMhB,SAAS,MAAMT,QAAQ,YAAY,CACvC;gBACE,MAAM;gBACN,SAAS;gBACT,OAAO;oBACL,iBAAiBW;oBACjBE;oBACAG;oBACAC;gBACF;gBACA,UAAU,OAAOd,OAAOC;oBACtB,MAAM,EAAEC,SAAS,EAAE,GAAGD;oBACtBE,OAAOD,WAAW;oBAClB,MAAM,EAAEqB,MAAM,EAAE,GAAG9B;oBACnB,MAAM+B,qBACJD,AAAW,kBAAXA,SACI9B,uBAAuB,kBAAkB,GACzC2B;oBAEN,MAAMK,cAAc,IAAI,CAAC,cAAc;oBACvCzC,MACE,sCACAyC,YAAY,GAAG,CAAC,CAACC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;oBAEhDvB,OAAOwB,MAAM,OAAO,CAACF,cAAc;oBACnC,IAAIA,AAAuB,MAAvBA,YAAY,MAAM,EACpBG,QAAQ,IAAI,CACV,CAAC,gBAAgB,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,gDAAgD,CAAC;oBAIrG,MAAMC,aAAa,MAAOL,AAAAA,CAAAA,qBACtBM,iBACAC,IAAG,EAAG/B,MAAM,eAAe,EAAE;wBAC/B,SAASE;wBACT,eAAeF,MAAM,YAAY;wBACjC,eAAe,IAAI,CAAC,SAAS,CAAC,aAAa;wBAC3CyB;wBACA,aAAahC;wBACb,qBAAqB,IAAI,CAAC,mBAAmB;wBAC7C,aAAagB;wBACbI;wBACAC;oBACF;oBACA9B,MAAM,cAAcgD,KAAK,SAAS,CAACH,YAAY,MAAM;oBAErD,MAAM,EACJI,OAAO,EACPC,GAAG,EACHC,IAAI,EACJC,KAAK,EACLC,KAAK,EACLC,WAAW,EACXC,iBAAiB,EAClB,GAAGV;oBAEJ5B,gBAAgB,IAAI,CAAC,GAAG,GAAG;wBACzB,GAAIA,gBAAgB,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;wBAClCqC;oBACF;oBACArC,gBAAgB,IAAI,CAAC,KAAK,GAAGoC;oBAC7BpC,gBAAgB,IAAI,CAAC,iBAAiB,GAAGsC;oBACzCtC,gBAAgB,IAAI,CAAC,MAAM,GAAG;wBAC5B,SAASgC,WAAW,EAAE;wBACtBC;wBACAC;wBACA,UAAUN,WAAW,QAAQ;wBAC7B,QAAQP;wBACR,wBAAwBO,WAAW,sBAAsB;oBAC3D;oBACA5B,gBAAgB,SAAS,GAAGC;oBAE5BC,OAAO,CAACiC,OAAO,CAAC,oBAAoB,EAAEA,MAAM,EAAE,EAAEF,OAAO,IAAI;oBAE3D,OAAO;wBACL,OAAO;4BACL,KAAK;wBACP;oBACF;gBACF;YACF,GACA;gBACE,gBAAgB;YAClB;YAGF,MAAML,aAAavB,QAAQ;YAG3B,MAAMd,QAAQqC,YAAY,WAAW,EAAE;YACvCX,SAAS,IAAI,IAAKW,YAAY,YAAY,EAAE;YAE5C,IAAIW;YACJ,IAAI;gBACFA,cAAc,MAAM,IAAI,CAAC,uBAAuB,CAC9ChD,OACAC,wBACAC,6BACA;oBACEiB;oBACA,SAAS;gBACX;YAEJ,EAAE,OAAOyB,OAAO;gBACd,OAAOvC,QAAQ,eAAe,CAC5B,CAAC,4CAA4C,EAAEuC,MAAM,SAAS,EAAEJ,KAAK,SAAS,CAC5ExC,QACC;YAEP;YACA,IAAI,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EACjDoC,QAAQ,IAAI,CACV,8FACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB;YAKnD,IAAI;gBACF,MAAMtB,SAAS,MAAMT,QAAQ,YAAY,CAAC2C,YAAY,KAAK;gBAC3DlB,eAAehB,QAAQ;YACzB,EAAE,OAAO8B,OAAY;gBAEnBf;gBACA,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAAG,CAAC,MAAM,EAAEoB,wBAAwB,iCAAiC,EAAEL,OAAO,WAAWM,OAAON,QAAQ;gBACvJpD,MACE,yFACAoD,iBAAiBO,QAAQP,MAAM,OAAO,GAAGM,OAAON,QAChD,6CACAf;YAEJ;YAEA,IAAIA,8BAA8BnC,uCAChC,OAAOW,QAAQ,eAAe,CAAC;YAIjC,IAAI,CAACgC,YAAY,wBACf;YAIF,EAAEZ;YAEF,IAAIA,cAAcE,sBAAsB;gBACtC,MAAMyB,WAAW,CAAC,UAAU,EAAEzB,qBAAqB,4JAA4J,CAAC;gBAChN,OAAOtB,QAAQ,eAAe,CAAC+C;YACjC;YAEA,IAAI,CAAC,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,EAClD,IAAI,CAAC,mBAAmB,CAAC,sBAAsB,GAAG,CAAC,MAAM,EAAEH,wBAAwB,gDAAgD,CAAC;QAExI;QAEA,OAAO;YACL,QAAQ;gBACNvB;gBACA,QAAQI;YACV;YACAlB;QACF;IACF;IAEQ,oBACNyC,IAAsE,EACtEC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACpC;QACA,MAAMC,YAA4C;YAChD,MAAM;YACN,SAASL;YACT,OAAO;gBACL,YAAYI,mBACP;oBACCH;oBACAG;gBACF,IACAH;YACN;YACA,UAAU,OAAO9C,OAAOmD;gBACtB,MAAM,EAAEpD,IAAI,EAAE,GAAGoD;gBACjB,IAAIC;gBACJ,MAAMC,YAAY,CAACC;oBACjBF,YAAYE;oBACZvD,KAAK,GAAG,GAAG;wBACTuD;oBACF;gBACF;gBAGA,MAAMpD,YAAYiD,YAAY,SAAS;gBACvChD,OAAOD,WAAW;gBAElB,MAAMqD,mBAAmBV,AAAS,YAATA;gBACzB,IAAIW,cAAcV;gBAClB,IAAIW,cAAc;gBAClB,IAAIF,oBAAqBV,CAAAA,AAAS,aAATA,QAAqBA,AAAS,cAATA,IAAiB,GAAI;oBACjEY,cAAc;oBACd,MAAMC,gBACJb,AAAS,aAATA,OACI,CAAC,kDAAkD,EAAEC,QAAQ,GAC7D,CAAC,+GAA+G,EAAEA,QAAQ;oBAChIU,cAAc;wBACZ,CAACC,YAAY,EAAEC;oBACjB;gBACF,OAAO,IAAIH,kBACTC,cAAc;oBACZ,CAACC,YAAY,EAAE,GAAGZ,KAAK,EAAE,EAAEC,QAAQ;gBACrC;gBAGF,IAAIa;gBAEJ,IAAIC,uBAAuB;gBAC3B,IAAIZ,KAAK,eAAe,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE;oBAC1DhE,MAAM;oBACN,MAAM6E,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,mBAAmB;oBACrDD,uBAAuB,MAAME,kBAC3BD,MACA,KACA,OACAb,KAAK,gBAAgB;gBAEzB;gBAEA,IAAI;oBACFW,gBAAgB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CACxCH,aACAT,aACAC,KACAY,sBACAX;gBAEJ,EAAE,OAAOb,OAAO;oBACd,IAAIA,iBAAiB2B,cACnBV,UAAUjB,MAAM,IAAI;oBAEtB,MAAMA;gBACR;gBAEA,MAAM,EAAE4B,IAAI,EAAE3B,KAAK,EAAE4B,OAAO,EAAEX,IAAI,EAAEf,iBAAiB,EAAE,GAAGoB;gBAC1DN,UAAUC;gBACVvD,KAAK,iBAAiB,GAAGwC;gBAEzB,IAAI2B,eAAeF;gBACnB,IAAIT,kBAEF,IAAI,AAAgB,YAAhB,OAAOS,MACTE,eAAeF;qBACV,IAAInB,AAAS,cAATA,MAEPqB,eADEF,QAAAA,OACa,QAECA,IAAY,CAACP,YAAY;qBAEtC,IAAIO,QAAAA,MACTE,eAAe;qBACV;oBACL/D,OACE6D,MAAM,CAACP,YAAY,KAAKrC,QACxB;oBAEF8C,eAAgBF,IAAY,CAACP,YAAY;gBAC3C;gBAGF,IAAIZ,AAAS,aAATA,QAAqB,CAACqB,cAAc;oBACtCnE,KAAK,KAAK,GAAGsC;oBACbtC,KAAK,OAAO,GAAGkE;oBACf,MAAM,IAAItB,MAAM,CAAC,kBAAkB,EAAEsB,SAAS;gBAChD;gBAEA,OAAO;oBACL,QAAQC;oBACR,KAAKd;oBACLf;oBACA4B;gBACF;YACF;QACF;QAEA,OAAOf;IACT;IACA,MAAM,yBACJL,IAA0D,EAC1DC,MAA2B,EAC3BC,WAAyB,EACzBC,GAA0B,EAC1BC,gBAAoC,EACP;QAC7B,MAAMpD,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aACE+C,MACA,AAAkB,YAAlB,OAAOC,SAAsBA,SAASd,KAAK,SAAS,CAACc;QAIzD,MAAMI,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9CL,MACAC,QACAC,aACAC,KACAC;QAGF,MAAM7C,SAASP,QAAQ,SAAS;QAChC,MAAMS,SAAS,MAAMT,QAAQ,YAAY,CAACqD;QAE1C,IAAI,CAAC5C,QACH,MAAM,IAAIqC,MACR;QAIJ,MAAM,EAAEpC,MAAM,EAAE0D,OAAO,EAAE,GAAG3D;QAE5B,OAAO;YACLC;YACA0D;YACA7D;QACF;IACF;IAEA,MAAM,aAAa+D,MAAc,EAAEC,YAA0B,EAAE;QAC7D,OAAO,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;YACtCD;QACF;IACF;IAEA,MAAM,QACJE,SAAsB,EACtBrB,GAA+B,EAC/BD,WAAyB,EACO;QAChC,MAAM,EAAEuB,UAAU,EAAErB,gBAAgB,EAAE,GAAGsB,YAAYF;QAErD,MAAMG,cAAc,CAAC,SAAS,EAAEF,YAAY;QAC5C,MAAMzE,UAAU,IAAI,CAAC,sBAAsB,CACzCC,aAAa,WAAW0E;QAE1B,MAAMpE,SAASP,QAAQ,SAAS;QAChC,MAAM,EACJ4E,SAAS,EACTC,eAAe,EACfC,WAAW,EACXC,kBAAkB,EAClB,GAAGC,SACJ,GAAG7B;QACJ,MAAM8B,oBAA0C;YAC9CH;YACAC;YACA,GAAGC,OAAO;QACZ;QAEA1E,OAAOkE,WAAW;QAClBlE,OAAOsE,WAAW;QAClBtE,OAAOuE,iBAAiB;QAExBvE,OACEuE,mBAAmBD,WACnB,CAAC,iGAAiG,EAAEC,gBAAgB,aAAa,EAAED,UAAU,CAAC,CAAC;QAGjJ,MAAMM,mBAAmBC,KAAK,GAAG;QACjC,IAAIC,iBAAiBF;QACrB,IAAIG,eAAe;QAEnB,MAAOD,iBAAiBF,oBAAoBN,UAAW;YACrD,MAAMU,oBAAoBH,KAAK,GAAG;YAClCC,iBAAiBE;YACjB,MAAMjC,YAAY,MAAM,IAAI,CAAC,mBAAmB,CAC9C,WACAoB,YACAvB,aACA+B,mBACA7B;YAGF,MAAM3C,SAAU,MAAMT,QAAQ,YAAY,CAACqD;YAO3C,IAAI5C,QAAQ,QACV,OAAO;gBACL,QAAQc;gBACRhB;YACF;YAGF8E,eACE5E,QAAQ,WACP,CAACA,UAAU,CAAC,0BAA0B,EAAEgE,YAAY,IACrD,CAAC,0CAA0C,EAAEA,YAAY;YAC3D,MAAMc,MAAMJ,KAAK,GAAG;YACpB,IAAII,MAAMD,oBAAoBT,iBAAiB;gBAC7C,MAAMW,gBAAgBX,kBAAmBU,CAAAA,MAAMD,iBAAgB;gBAC/D,MAAMG,YAAY,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC;oBACjD,QAAQD;gBACV;gBACA,MAAMxF,QAAQ,MAAM,CAACyF;YACvB;QACF;QAEA,OAAOzF,QAAQ,eAAe,CAAC,CAAC,iBAAiB,EAAEqF,cAAc;IACnE;IAlmBA,YACEK,iBAAoC,EACpCC,OAAgB,EAChBC,IAMC,CACD;QAjCF;QAEA;QAEA;QAEA,uBAAiB,uBAAjB;QAEA,uBAAiB,eAAjB;QAEA,uBAAQ,uBAAR;QAEA;QAEA,uBAAiB,SAAjB;QAEA;QAkBE,IAAI,CAAC,SAAS,GAAGF;QACjB,IAAI,CAAC,OAAO,GAAGC;QACf,IAAI,CAAC,SAAS,GAAGC,KAAK,SAAS;QAC/B,IAAI,CAAC,mBAAmB,GAAGA,MAAM;QACjC,IAAI,CAAC,oBAAoB,GAAGA,KAAK,oBAAoB;QACrD,IAAI,CAAC,KAAK,GAAGA,KAAK,KAAK;QACvB,IAAI,CAAC,mBAAmB,GAAG,IAAIC;QAC/B,IAAI,CAAC,mBAAmB,GAAGD,KAAK,WAAW;QAC3C,IAAI,CAAC,WAAW,GAAG,IAAIE,YAAY;YACjCJ;YACAC;YACA,WAAWC,KAAK,SAAS;YACzB,aAAa,IAAI,CAAC,cAAc;QAClC;IACF;AA0kBF;AAEO,eAAezE,gBACpBuE,iBAAoC,EACpCxE,iBAAuC,EACvCW,MAAwB;IAExB,IAAI,CAACX,mBAAmB,QACtB,OAAOW;IAGT,IAAI,CAAC6D,kBAAkB,2BAA2B,EAChD,MAAM,IAAI5C,MACR,CAAC,gCAAgC,EAAE4C,kBAAkB,aAAa,EAAE;IAIxE,MAAMK,UAAU,OAAOC;QACrB,MAAMA,QAAQ,MAAM,CAAC9E;IACvB;IAEA,MAAM,EAAE+E,OAAO,EAAEC,QAAQ,EAAE,GACzB,MAAMR,kBAAkB,2BAA2B,CAACK;IACtD,IAAI;QACF,MAAMtF,SAAS,MAAMoB;QAErB,MAAMU,QAAQ2D;QACd,IAAI3D,OACF,MAAMA;QAER,OAAO9B;IACT,SAAU;QACRwF;IACF;AACF"}
@@ -1,3 +1,4 @@
1
+ import { ScreenshotItem } from "../screenshot-item.mjs";
1
2
  import { uploadTestInfoToServer } from "../utils.mjs";
2
3
  import { MIDSCENE_REPORT_TAG_NAME, globalConfigManager } from "@midscene/shared/env";
3
4
  import { generateElementByPosition } from "@midscene/shared/extractor";
@@ -22,9 +23,10 @@ async function commonContextParser(interfaceInstance, _opt) {
22
23
  debugProfile('will get size');
23
24
  const size = await interfaceInstance.size();
24
25
  debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);
26
+ const screenshot = ScreenshotItem.create(screenshotBase64);
25
27
  return {
26
28
  size,
27
- screenshotBase64: screenshotBase64
29
+ screenshot
28
30
  };
29
31
  }
30
32
  function getReportFileName(tag = 'web') {
@@ -101,7 +103,7 @@ async function matchElementFromCache(context, cacheEntry, cachePrompt, cacheable
101
103
  return;
102
104
  }
103
105
  }
104
- const getMidsceneVersion = ()=>"1.2.2-beta-20260115092052.0";
106
+ const getMidsceneVersion = ()=>"1.2.2-beta-20260116060040.0";
105
107
  const parsePrompt = (prompt)=>{
106
108
  if ('string' == typeof prompt) return {
107
109
  textPrompt: prompt,
@@ -1 +1 @@
1
- {"version":3,"file":"agent/utils.mjs","sources":["../../../src/agent/utils.ts"],"sourcesContent":["import type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport type {\n ElementCacheFeature,\n LocateResultElement,\n PlanningLocateParam,\n UIContext,\n} from '@/types';\nimport { uploadTestInfoToServer } from '@/utils';\nimport {\n MIDSCENE_REPORT_TAG_NAME,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { assert, logMsg, uuid } from '@midscene/shared/utils';\nimport dayjs from 'dayjs';\nimport type { TaskCache } from './task-cache';\nimport { debug as cacheDebug } from './task-cache';\n\nconst debugProfile = getDebug('web:tool:profile');\n\nexport async function commonContextParser(\n interfaceInstance: AbstractInterface,\n _opt: { uploadServerUrl?: string },\n): Promise<UIContext> {\n assert(interfaceInstance, 'interfaceInstance is required');\n\n debugProfile('Getting interface description');\n const description = interfaceInstance.describe?.() || '';\n debugProfile('Interface description end');\n\n debugProfile('Uploading test info to server');\n uploadTestInfoToServer({\n testUrl: description,\n serverUrl: _opt.uploadServerUrl,\n });\n debugProfile('UploadTestInfoToServer end');\n\n const screenshotBase64 = await interfaceInstance.screenshotBase64();\n assert(screenshotBase64!, 'screenshotBase64 is required');\n\n debugProfile('will get size');\n const size = await interfaceInstance.size();\n debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);\n\n return {\n size,\n screenshotBase64: screenshotBase64!,\n };\n}\n\nexport function getReportFileName(tag = 'web') {\n const reportTagName = globalConfigManager.getEnvConfigValue(\n MIDSCENE_REPORT_TAG_NAME,\n );\n const dateTimeInFileName = dayjs().format('YYYY-MM-DD_HH-mm-ss');\n // ensure uniqueness at the same time\n const uniqueId = uuid().substring(0, 8);\n return `${reportTagName || tag}-${dateTimeInFileName}-${uniqueId}`;\n}\n\nexport function printReportMsg(filepath: string) {\n logMsg(`Midscene - report file updated: ${filepath}`);\n}\n\n/**\n * Get the current execution file name\n * @returns The name of the current execution file\n */\nexport function getCurrentExecutionFile(trace?: string): string | false {\n const error = new Error();\n const stackTrace = trace || error.stack;\n const pkgDir = process.cwd() || '';\n if (stackTrace) {\n const stackLines = stackTrace.split('\\n');\n for (const line of stackLines) {\n if (\n line.includes('.spec.') ||\n line.includes('.test.') ||\n line.includes('.ts') ||\n line.includes('.js')\n ) {\n const match = line.match(/(?:at\\s+)?(.*?\\.(?:spec|test)\\.[jt]s)/);\n if (match?.[1]) {\n const targetFileName = match[1]\n .replace(pkgDir, '')\n .trim()\n .replace('at ', '');\n return targetFileName;\n }\n }\n }\n }\n return false;\n}\n\nconst testFileIndex = new Map<string, number>();\n\nexport function generateCacheId(fileName?: string): string {\n let taskFile = fileName || getCurrentExecutionFile();\n if (!taskFile) {\n taskFile = uuid();\n console.warn(\n 'Midscene - using random UUID for cache id. Cache may be invalid.',\n );\n }\n\n if (testFileIndex.has(taskFile)) {\n const currentIndex = testFileIndex.get(taskFile);\n if (currentIndex !== undefined) {\n testFileIndex.set(taskFile, currentIndex + 1);\n }\n } else {\n testFileIndex.set(taskFile, 1);\n }\n return `${taskFile}-${testFileIndex.get(taskFile)}`;\n}\n\nexport function ifPlanLocateParamIsBbox(\n planLocateParam: PlanningLocateParam,\n): boolean {\n return !!(\n planLocateParam.bbox &&\n Array.isArray(planLocateParam.bbox) &&\n planLocateParam.bbox.length === 4\n );\n}\n\nexport function matchElementFromPlan(\n planLocateParam: PlanningLocateParam,\n): LocateResultElement | undefined {\n if (!planLocateParam) {\n return undefined;\n }\n\n if (planLocateParam.bbox) {\n const centerPosition = {\n x: Math.floor((planLocateParam.bbox[0] + planLocateParam.bbox[2]) / 2),\n y: Math.floor((planLocateParam.bbox[1] + planLocateParam.bbox[3]) / 2),\n };\n\n const element = generateElementByPosition(\n centerPosition,\n typeof planLocateParam.prompt === 'string'\n ? planLocateParam.prompt\n : planLocateParam.prompt?.prompt || '',\n );\n return element;\n }\n\n return undefined;\n}\n\nexport async function matchElementFromCache(\n context: {\n taskCache?: TaskCache;\n interfaceInstance: AbstractInterface;\n },\n cacheEntry: ElementCacheFeature | undefined,\n cachePrompt: TUserPrompt,\n cacheable: boolean | undefined,\n): Promise<LocateResultElement | undefined> {\n if (!cacheEntry) {\n return undefined;\n }\n\n if (cacheable === false) {\n cacheDebug('cache disabled for prompt: %s', cachePrompt);\n return undefined;\n }\n\n if (!context.taskCache?.isCacheResultUsed) {\n return undefined;\n }\n\n if (!context.interfaceInstance.rectMatchesCacheFeature) {\n cacheDebug(\n 'interface does not implement rectMatchesCacheFeature, skip cache',\n );\n return undefined;\n }\n\n try {\n const rect =\n await context.interfaceInstance.rectMatchesCacheFeature(cacheEntry);\n const element: LocateResultElement = {\n center: [\n Math.round(rect.left + rect.width / 2),\n Math.round(rect.top + rect.height / 2),\n ],\n rect,\n description:\n typeof cachePrompt === 'string'\n ? cachePrompt\n : cachePrompt.prompt || '',\n };\n\n cacheDebug('cache hit, prompt: %s', cachePrompt);\n return element;\n } catch (error) {\n cacheDebug('rectMatchesCacheFeature error: %s', error);\n return undefined;\n }\n}\n\ndeclare const __VERSION__: string | undefined;\n\nexport const getMidsceneVersion = (): string => {\n if (typeof __VERSION__ !== 'undefined') {\n return __VERSION__;\n } else if (\n process.env.__VERSION__ &&\n process.env.__VERSION__ !== 'undefined'\n ) {\n return process.env.__VERSION__;\n }\n throw new Error('__VERSION__ inject failed during build');\n};\n\nexport const parsePrompt = (\n prompt: TUserPrompt,\n): {\n textPrompt: string;\n multimodalPrompt?: TMultimodalPrompt;\n} => {\n if (typeof prompt === 'string') {\n return {\n textPrompt: prompt,\n multimodalPrompt: undefined,\n };\n }\n return {\n textPrompt: prompt.prompt,\n multimodalPrompt: prompt.images\n ? {\n images: prompt.images,\n convertHttpImage2Base64: !!prompt.convertHttpImage2Base64,\n }\n : undefined,\n };\n};\n"],"names":["debugProfile","getDebug","commonContextParser","interfaceInstance","_opt","assert","description","uploadTestInfoToServer","screenshotBase64","size","getReportFileName","tag","reportTagName","globalConfigManager","MIDSCENE_REPORT_TAG_NAME","dateTimeInFileName","dayjs","uniqueId","uuid","printReportMsg","filepath","logMsg","getCurrentExecutionFile","trace","error","Error","stackTrace","pkgDir","process","stackLines","line","match","targetFileName","testFileIndex","Map","generateCacheId","fileName","taskFile","console","currentIndex","undefined","ifPlanLocateParamIsBbox","planLocateParam","Array","matchElementFromPlan","centerPosition","Math","element","generateElementByPosition","matchElementFromCache","context","cacheEntry","cachePrompt","cacheable","cacheDebug","rect","getMidsceneVersion","__VERSION__","parsePrompt","prompt"],"mappings":";;;;;;;AAqBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,oBACpBC,iBAAoC,EACpCC,IAAkC;IAElCC,OAAOF,mBAAmB;IAE1BH,aAAa;IACb,MAAMM,cAAcH,kBAAkB,QAAQ,QAAQ;IACtDH,aAAa;IAEbA,aAAa;IACbO,uBAAuB;QACrB,SAASD;QACT,WAAWF,KAAK,eAAe;IACjC;IACAJ,aAAa;IAEb,MAAMQ,mBAAmB,MAAML,kBAAkB,gBAAgB;IACjEE,OAAOG,kBAAmB;IAE1BR,aAAa;IACb,MAAMS,OAAO,MAAMN,kBAAkB,IAAI;IACzCH,aAAa,CAAC,MAAM,EAAES,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,MAAM,EAAEA,KAAK,GAAG,EAAE;IAElE,OAAO;QACLA;QACA,kBAAkBD;IACpB;AACF;AAEO,SAASE,kBAAkBC,MAAM,KAAK;IAC3C,MAAMC,gBAAgBC,oBAAoB,iBAAiB,CACzDC;IAEF,MAAMC,qBAAqBC,QAAQ,MAAM,CAAC;IAE1C,MAAMC,WAAWC,OAAO,SAAS,CAAC,GAAG;IACrC,OAAO,GAAGN,iBAAiBD,IAAI,CAAC,EAAEI,mBAAmB,CAAC,EAAEE,UAAU;AACpE;AAEO,SAASE,eAAeC,QAAgB;IAC7CC,OAAO,CAAC,gCAAgC,EAAED,UAAU;AACtD;AAMO,SAASE,wBAAwBC,KAAc;IACpD,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,aAAaH,SAASC,MAAM,KAAK;IACvC,MAAMG,SAASC,QAAQ,GAAG,MAAM;IAChC,IAAIF,YAAY;QACd,MAAMG,aAAaH,WAAW,KAAK,CAAC;QACpC,KAAK,MAAMI,QAAQD,WACjB,IACEC,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,UACdA,KAAK,QAAQ,CAAC,QACd;YACA,MAAMC,QAAQD,KAAK,KAAK,CAAC;YACzB,IAAIC,OAAO,CAAC,EAAE,EAAE;gBACd,MAAMC,iBAAiBD,KAAK,CAAC,EAAE,CAC5B,OAAO,CAACJ,QAAQ,IAChB,IAAI,GACJ,OAAO,CAAC,OAAO;gBAClB,OAAOK;YACT;QACF;IAEJ;IACA,OAAO;AACT;AAEA,MAAMC,gBAAgB,IAAIC;AAEnB,SAASC,gBAAgBC,QAAiB;IAC/C,IAAIC,WAAWD,YAAYd;IAC3B,IAAI,CAACe,UAAU;QACbA,WAAWnB;QACXoB,QAAQ,IAAI,CACV;IAEJ;IAEA,IAAIL,cAAc,GAAG,CAACI,WAAW;QAC/B,MAAME,eAAeN,cAAc,GAAG,CAACI;QACvC,IAAIE,AAAiBC,WAAjBD,cACFN,cAAc,GAAG,CAACI,UAAUE,eAAe;IAE/C,OACEN,cAAc,GAAG,CAACI,UAAU;IAE9B,OAAO,GAAGA,SAAS,CAAC,EAAEJ,cAAc,GAAG,CAACI,WAAW;AACrD;AAEO,SAASI,wBACdC,eAAoC;IAEpC,OAAO,CAAC,CACNA,CAAAA,gBAAgB,IAAI,IACpBC,MAAM,OAAO,CAACD,gBAAgB,IAAI,KAClCA,AAAgC,MAAhCA,gBAAgB,IAAI,CAAC,MAAM,AAAK;AAEpC;AAEO,SAASE,qBACdF,eAAoC;IAEpC,IAAI,CAACA,iBACH;IAGF,IAAIA,gBAAgB,IAAI,EAAE;QACxB,MAAMG,iBAAiB;YACrB,GAAGC,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;YACpE,GAAGI,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;QACtE;QAEA,MAAMK,UAAUC,0BACdH,gBACA,AAAkC,YAAlC,OAAOH,gBAAgB,MAAM,GACzBA,gBAAgB,MAAM,GACtBA,gBAAgB,MAAM,EAAE,UAAU;QAExC,OAAOK;IACT;AAGF;AAEO,eAAeE,sBACpBC,OAGC,EACDC,UAA2C,EAC3CC,WAAwB,EACxBC,SAA8B;IAE9B,IAAI,CAACF,YACH;IAGF,IAAIE,AAAc,UAAdA,WAAqB,YACvBC,MAAW,iCAAiCF;IAI9C,IAAI,CAACF,QAAQ,SAAS,EAAE,mBACtB;IAGF,IAAI,CAACA,QAAQ,iBAAiB,CAAC,uBAAuB,EAAE,YACtDI,MACE;IAKJ,IAAI;QACF,MAAMC,OACJ,MAAML,QAAQ,iBAAiB,CAAC,uBAAuB,CAACC;QAC1D,MAAMJ,UAA+B;YACnC,QAAQ;gBACND,KAAK,KAAK,CAACS,KAAK,IAAI,GAAGA,KAAK,KAAK,GAAG;gBACpCT,KAAK,KAAK,CAACS,KAAK,GAAG,GAAGA,KAAK,MAAM,GAAG;aACrC;YACDA;YACA,aACE,AAAuB,YAAvB,OAAOH,cACHA,cACAA,YAAY,MAAM,IAAI;QAC9B;QAEAE,MAAW,yBAAyBF;QACpC,OAAOL;IACT,EAAE,OAAOvB,OAAO;QACd8B,MAAW,qCAAqC9B;QAChD;IACF;AACF;AAIO,MAAMgC,qBAAqB,IAEvBC;AAUJ,MAAMC,cAAc,CACzBC;IAKA,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAO;QACL,YAAYA;QACZ,kBAAkBnB;IACpB;IAEF,OAAO;QACL,YAAYmB,OAAO,MAAM;QACzB,kBAAkBA,OAAO,MAAM,GAC3B;YACE,QAAQA,OAAO,MAAM;YACrB,yBAAyB,CAAC,CAACA,OAAO,uBAAuB;QAC3D,IACAnB;IACN;AACF"}
1
+ {"version":3,"file":"agent/utils.mjs","sources":["../../../src/agent/utils.ts"],"sourcesContent":["import type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport { ScreenshotItem } from '@/screenshot-item';\nimport type {\n ElementCacheFeature,\n LocateResultElement,\n PlanningLocateParam,\n UIContext,\n} from '@/types';\nimport { uploadTestInfoToServer } from '@/utils';\nimport {\n MIDSCENE_REPORT_TAG_NAME,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { assert, logMsg, uuid } from '@midscene/shared/utils';\nimport dayjs from 'dayjs';\nimport type { TaskCache } from './task-cache';\nimport { debug as cacheDebug } from './task-cache';\n\nconst debugProfile = getDebug('web:tool:profile');\n\nexport async function commonContextParser(\n interfaceInstance: AbstractInterface,\n _opt: { uploadServerUrl?: string },\n): Promise<UIContext> {\n assert(interfaceInstance, 'interfaceInstance is required');\n\n debugProfile('Getting interface description');\n const description = interfaceInstance.describe?.() || '';\n debugProfile('Interface description end');\n\n debugProfile('Uploading test info to server');\n uploadTestInfoToServer({\n testUrl: description,\n serverUrl: _opt.uploadServerUrl,\n });\n debugProfile('UploadTestInfoToServer end');\n\n const screenshotBase64 = await interfaceInstance.screenshotBase64();\n assert(screenshotBase64!, 'screenshotBase64 is required');\n\n debugProfile('will get size');\n const size = await interfaceInstance.size();\n debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);\n\n const screenshot = ScreenshotItem.create(screenshotBase64!);\n\n return {\n size,\n screenshot,\n };\n}\n\nexport function getReportFileName(tag = 'web') {\n const reportTagName = globalConfigManager.getEnvConfigValue(\n MIDSCENE_REPORT_TAG_NAME,\n );\n const dateTimeInFileName = dayjs().format('YYYY-MM-DD_HH-mm-ss');\n // ensure uniqueness at the same time\n const uniqueId = uuid().substring(0, 8);\n return `${reportTagName || tag}-${dateTimeInFileName}-${uniqueId}`;\n}\n\nexport function printReportMsg(filepath: string) {\n logMsg(`Midscene - report file updated: ${filepath}`);\n}\n\n/**\n * Get the current execution file name\n * @returns The name of the current execution file\n */\nexport function getCurrentExecutionFile(trace?: string): string | false {\n const error = new Error();\n const stackTrace = trace || error.stack;\n const pkgDir = process.cwd() || '';\n if (stackTrace) {\n const stackLines = stackTrace.split('\\n');\n for (const line of stackLines) {\n if (\n line.includes('.spec.') ||\n line.includes('.test.') ||\n line.includes('.ts') ||\n line.includes('.js')\n ) {\n const match = line.match(/(?:at\\s+)?(.*?\\.(?:spec|test)\\.[jt]s)/);\n if (match?.[1]) {\n const targetFileName = match[1]\n .replace(pkgDir, '')\n .trim()\n .replace('at ', '');\n return targetFileName;\n }\n }\n }\n }\n return false;\n}\n\nconst testFileIndex = new Map<string, number>();\n\nexport function generateCacheId(fileName?: string): string {\n let taskFile = fileName || getCurrentExecutionFile();\n if (!taskFile) {\n taskFile = uuid();\n console.warn(\n 'Midscene - using random UUID for cache id. Cache may be invalid.',\n );\n }\n\n if (testFileIndex.has(taskFile)) {\n const currentIndex = testFileIndex.get(taskFile);\n if (currentIndex !== undefined) {\n testFileIndex.set(taskFile, currentIndex + 1);\n }\n } else {\n testFileIndex.set(taskFile, 1);\n }\n return `${taskFile}-${testFileIndex.get(taskFile)}`;\n}\n\nexport function ifPlanLocateParamIsBbox(\n planLocateParam: PlanningLocateParam,\n): boolean {\n return !!(\n planLocateParam.bbox &&\n Array.isArray(planLocateParam.bbox) &&\n planLocateParam.bbox.length === 4\n );\n}\n\nexport function matchElementFromPlan(\n planLocateParam: PlanningLocateParam,\n): LocateResultElement | undefined {\n if (!planLocateParam) {\n return undefined;\n }\n\n if (planLocateParam.bbox) {\n const centerPosition = {\n x: Math.floor((planLocateParam.bbox[0] + planLocateParam.bbox[2]) / 2),\n y: Math.floor((planLocateParam.bbox[1] + planLocateParam.bbox[3]) / 2),\n };\n\n const element = generateElementByPosition(\n centerPosition,\n typeof planLocateParam.prompt === 'string'\n ? planLocateParam.prompt\n : planLocateParam.prompt?.prompt || '',\n );\n return element;\n }\n\n return undefined;\n}\n\nexport async function matchElementFromCache(\n context: {\n taskCache?: TaskCache;\n interfaceInstance: AbstractInterface;\n },\n cacheEntry: ElementCacheFeature | undefined,\n cachePrompt: TUserPrompt,\n cacheable: boolean | undefined,\n): Promise<LocateResultElement | undefined> {\n if (!cacheEntry) {\n return undefined;\n }\n\n if (cacheable === false) {\n cacheDebug('cache disabled for prompt: %s', cachePrompt);\n return undefined;\n }\n\n if (!context.taskCache?.isCacheResultUsed) {\n return undefined;\n }\n\n if (!context.interfaceInstance.rectMatchesCacheFeature) {\n cacheDebug(\n 'interface does not implement rectMatchesCacheFeature, skip cache',\n );\n return undefined;\n }\n\n try {\n const rect =\n await context.interfaceInstance.rectMatchesCacheFeature(cacheEntry);\n const element: LocateResultElement = {\n center: [\n Math.round(rect.left + rect.width / 2),\n Math.round(rect.top + rect.height / 2),\n ],\n rect,\n description:\n typeof cachePrompt === 'string'\n ? cachePrompt\n : cachePrompt.prompt || '',\n };\n\n cacheDebug('cache hit, prompt: %s', cachePrompt);\n return element;\n } catch (error) {\n cacheDebug('rectMatchesCacheFeature error: %s', error);\n return undefined;\n }\n}\n\ndeclare const __VERSION__: string | undefined;\n\nexport const getMidsceneVersion = (): string => {\n if (typeof __VERSION__ !== 'undefined') {\n return __VERSION__;\n } else if (\n process.env.__VERSION__ &&\n process.env.__VERSION__ !== 'undefined'\n ) {\n return process.env.__VERSION__;\n }\n throw new Error('__VERSION__ inject failed during build');\n};\n\nexport const parsePrompt = (\n prompt: TUserPrompt,\n): {\n textPrompt: string;\n multimodalPrompt?: TMultimodalPrompt;\n} => {\n if (typeof prompt === 'string') {\n return {\n textPrompt: prompt,\n multimodalPrompt: undefined,\n };\n }\n return {\n textPrompt: prompt.prompt,\n multimodalPrompt: prompt.images\n ? {\n images: prompt.images,\n convertHttpImage2Base64: !!prompt.convertHttpImage2Base64,\n }\n : undefined,\n };\n};\n"],"names":["debugProfile","getDebug","commonContextParser","interfaceInstance","_opt","assert","description","uploadTestInfoToServer","screenshotBase64","size","screenshot","ScreenshotItem","getReportFileName","tag","reportTagName","globalConfigManager","MIDSCENE_REPORT_TAG_NAME","dateTimeInFileName","dayjs","uniqueId","uuid","printReportMsg","filepath","logMsg","getCurrentExecutionFile","trace","error","Error","stackTrace","pkgDir","process","stackLines","line","match","targetFileName","testFileIndex","Map","generateCacheId","fileName","taskFile","console","currentIndex","undefined","ifPlanLocateParamIsBbox","planLocateParam","Array","matchElementFromPlan","centerPosition","Math","element","generateElementByPosition","matchElementFromCache","context","cacheEntry","cachePrompt","cacheable","cacheDebug","rect","getMidsceneVersion","__VERSION__","parsePrompt","prompt"],"mappings":";;;;;;;;AAsBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,oBACpBC,iBAAoC,EACpCC,IAAkC;IAElCC,OAAOF,mBAAmB;IAE1BH,aAAa;IACb,MAAMM,cAAcH,kBAAkB,QAAQ,QAAQ;IACtDH,aAAa;IAEbA,aAAa;IACbO,uBAAuB;QACrB,SAASD;QACT,WAAWF,KAAK,eAAe;IACjC;IACAJ,aAAa;IAEb,MAAMQ,mBAAmB,MAAML,kBAAkB,gBAAgB;IACjEE,OAAOG,kBAAmB;IAE1BR,aAAa;IACb,MAAMS,OAAO,MAAMN,kBAAkB,IAAI;IACzCH,aAAa,CAAC,MAAM,EAAES,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,MAAM,EAAEA,KAAK,GAAG,EAAE;IAElE,MAAMC,aAAaC,eAAe,MAAM,CAACH;IAEzC,OAAO;QACLC;QACAC;IACF;AACF;AAEO,SAASE,kBAAkBC,MAAM,KAAK;IAC3C,MAAMC,gBAAgBC,oBAAoB,iBAAiB,CACzDC;IAEF,MAAMC,qBAAqBC,QAAQ,MAAM,CAAC;IAE1C,MAAMC,WAAWC,OAAO,SAAS,CAAC,GAAG;IACrC,OAAO,GAAGN,iBAAiBD,IAAI,CAAC,EAAEI,mBAAmB,CAAC,EAAEE,UAAU;AACpE;AAEO,SAASE,eAAeC,QAAgB;IAC7CC,OAAO,CAAC,gCAAgC,EAAED,UAAU;AACtD;AAMO,SAASE,wBAAwBC,KAAc;IACpD,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,aAAaH,SAASC,MAAM,KAAK;IACvC,MAAMG,SAASC,QAAQ,GAAG,MAAM;IAChC,IAAIF,YAAY;QACd,MAAMG,aAAaH,WAAW,KAAK,CAAC;QACpC,KAAK,MAAMI,QAAQD,WACjB,IACEC,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,UACdA,KAAK,QAAQ,CAAC,QACd;YACA,MAAMC,QAAQD,KAAK,KAAK,CAAC;YACzB,IAAIC,OAAO,CAAC,EAAE,EAAE;gBACd,MAAMC,iBAAiBD,KAAK,CAAC,EAAE,CAC5B,OAAO,CAACJ,QAAQ,IAChB,IAAI,GACJ,OAAO,CAAC,OAAO;gBAClB,OAAOK;YACT;QACF;IAEJ;IACA,OAAO;AACT;AAEA,MAAMC,gBAAgB,IAAIC;AAEnB,SAASC,gBAAgBC,QAAiB;IAC/C,IAAIC,WAAWD,YAAYd;IAC3B,IAAI,CAACe,UAAU;QACbA,WAAWnB;QACXoB,QAAQ,IAAI,CACV;IAEJ;IAEA,IAAIL,cAAc,GAAG,CAACI,WAAW;QAC/B,MAAME,eAAeN,cAAc,GAAG,CAACI;QACvC,IAAIE,AAAiBC,WAAjBD,cACFN,cAAc,GAAG,CAACI,UAAUE,eAAe;IAE/C,OACEN,cAAc,GAAG,CAACI,UAAU;IAE9B,OAAO,GAAGA,SAAS,CAAC,EAAEJ,cAAc,GAAG,CAACI,WAAW;AACrD;AAEO,SAASI,wBACdC,eAAoC;IAEpC,OAAO,CAAC,CACNA,CAAAA,gBAAgB,IAAI,IACpBC,MAAM,OAAO,CAACD,gBAAgB,IAAI,KAClCA,AAAgC,MAAhCA,gBAAgB,IAAI,CAAC,MAAM,AAAK;AAEpC;AAEO,SAASE,qBACdF,eAAoC;IAEpC,IAAI,CAACA,iBACH;IAGF,IAAIA,gBAAgB,IAAI,EAAE;QACxB,MAAMG,iBAAiB;YACrB,GAAGC,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;YACpE,GAAGI,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;QACtE;QAEA,MAAMK,UAAUC,0BACdH,gBACA,AAAkC,YAAlC,OAAOH,gBAAgB,MAAM,GACzBA,gBAAgB,MAAM,GACtBA,gBAAgB,MAAM,EAAE,UAAU;QAExC,OAAOK;IACT;AAGF;AAEO,eAAeE,sBACpBC,OAGC,EACDC,UAA2C,EAC3CC,WAAwB,EACxBC,SAA8B;IAE9B,IAAI,CAACF,YACH;IAGF,IAAIE,AAAc,UAAdA,WAAqB,YACvBC,MAAW,iCAAiCF;IAI9C,IAAI,CAACF,QAAQ,SAAS,EAAE,mBACtB;IAGF,IAAI,CAACA,QAAQ,iBAAiB,CAAC,uBAAuB,EAAE,YACtDI,MACE;IAKJ,IAAI;QACF,MAAMC,OACJ,MAAML,QAAQ,iBAAiB,CAAC,uBAAuB,CAACC;QAC1D,MAAMJ,UAA+B;YACnC,QAAQ;gBACND,KAAK,KAAK,CAACS,KAAK,IAAI,GAAGA,KAAK,KAAK,GAAG;gBACpCT,KAAK,KAAK,CAACS,KAAK,GAAG,GAAGA,KAAK,MAAM,GAAG;aACrC;YACDA;YACA,aACE,AAAuB,YAAvB,OAAOH,cACHA,cACAA,YAAY,MAAM,IAAI;QAC9B;QAEAE,MAAW,yBAAyBF;QACpC,OAAOL;IACT,EAAE,OAAOvB,OAAO;QACd8B,MAAW,qCAAqC9B;QAChD;IACF;AACF;AAIO,MAAMgC,qBAAqB,IAEvBC;AAUJ,MAAMC,cAAc,CACzBC;IAKA,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAO;QACL,YAAYA;QACZ,kBAAkBnB;IACpB;IAEF,OAAO;QACL,YAAYmB,OAAO,MAAM;QACzB,kBAAkBA,OAAO,MAAM,GAC3B;YACE,QAAQA,OAAO,MAAM;YACrB,yBAAyB,CAAC,CAACA,OAAO,uBAAuB;QAC3D,IACAnB;IACN;AACF"}
@@ -56,7 +56,7 @@ const promptsToChatParam = async (multimodalPrompt)=>{
56
56
  async function AiLocateElement(options) {
57
57
  const { context, targetElementDescription, callAIFn, modelConfig } = options;
58
58
  const { vlMode } = modelConfig;
59
- const { screenshotBase64 } = context;
59
+ const screenshotBase64 = context.screenshot.getData();
60
60
  assert(targetElementDescription, "cannot find the target element description");
61
61
  const targetElementDescriptionText = extraTextFromUserPrompt(targetElementDescription);
62
62
  const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);
@@ -149,7 +149,7 @@ async function AiLocateElement(options) {
149
149
  async function AiLocateSection(options) {
150
150
  const { context, sectionDescription, modelConfig } = options;
151
151
  const { vlMode } = modelConfig;
152
- const { screenshotBase64 } = context;
152
+ const screenshotBase64 = context.screenshot.getData();
153
153
  const systemPrompt = systemPromptToLocateSection(vlMode);
154
154
  const sectionLocatorInstructionText = sectionLocatorInstruction(extraTextFromUserPrompt(sectionDescription));
155
155
  const msgs = [
@@ -217,7 +217,7 @@ async function AiLocateSection(options) {
217
217
  async function AiExtractElementInfo(options) {
218
218
  const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } = options;
219
219
  const systemPrompt = systemPromptToExtract();
220
- const { screenshotBase64 } = context;
220
+ const screenshotBase64 = context.screenshot.getData();
221
221
  const extractDataPromptText = extractDataQueryPrompt(options.pageDescription || '', dataQuery);
222
222
  const userContent = [];
223
223
  if (extractOption?.screenshotIncluded !== false) userContent.push({
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const { screenshotBase64 } = context;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n reasoning_content: result.reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","res","rawResponse","JSON","resRect","matchedElements","errors","Array","adaptBboxToRect","rectCenter","element","generateElementByPosition","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;AAiDA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OASrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7BM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,4BAA4BP;IAEjD,IAAIQ,eAAeP;IACnB,IAAIQ,aAAab,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIc,cAAcd,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIe,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIf,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFa,eAAeb,QAAQ,YAAY,CAAC,WAAW;QAC/Cc,aAAad,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCe,cAAcf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCgB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIV,AAAW,iBAAXA,QAAyB;QAClC,MAAMa,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMvB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMkB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMC,MAAM,MAAMlB,SAASR,MAAMS;IAEjC,MAAMkB,cAAcC,KAAK,SAAS,CAACF,IAAI,OAAO;IAE9C,IAAIG;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBM,MAAM,OAAO,CAACN,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAG,UAAUI,gBACRP,IAAI,OAAO,CAAC,IAAI,EAChBP,YACAC,aACAf,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BgB,oBACAC,qBACAZ;YAGFjB,aAAa,WAAWoC;YAExB,MAAMK,aAAa;gBACjB,GAAGL,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMM,UAA+BC,0BACnCF,YACArB;YAEFkB,SAAS,EAAE;YAEX,IAAII,SACFL,kBAAkB;gBAACK;aAAQ;QAE/B;IACF,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACN,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEO,IAAI,CAAC,CAAC;aAFtBP,SAAS;YAACO;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMT;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAJ;QACA,OAAOD,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAec,gBAAgBnC,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEmC,kBAAkB,EAAEhC,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAMU,eAAe0B,4BAA4BhC;IACjD,MAAMiC,gCAAgCC,0BACpChD,wBAAwB6C;IAE1B,MAAMzC,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMgC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMhB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQ2C,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAzC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMoB,SAAS,MAAMC,yBACnB9C,MACAS;IAGF,IAAIsC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAahB,gBACjBe,aACA1C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BsD;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DlD,aAAa,wBAAwBuD;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASpB,MAAM,OAAO,CAACoB,OAC/B,GAAG,CAAC,CAACA,OACGnB,gBACLmB,MACA9C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqBwD;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DxD,aAAa,iBAAiB0D;QAG9BN,cAAcQ,iBAAiBF,YAAY/C,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BoD;IAC1C;IAEA,IAAIS,cAAc7C;IAClB,IAAIoC,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1B/C,kBACAoC,aACArC,AAAW,iBAAXA;QAEF8C,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAajB,KAAK,SAAS,CAACiB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBtD,OAO7C;IACC,MAAM,EAAEuD,SAAS,EAAEtD,OAAO,EAAEuD,aAAa,EAAE9D,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe8C;IACrB,MAAM,EAAEnD,gBAAgB,EAAE,GAAGL;IAE7B,MAAMyD,wBAAwBC,uBAC5B3D,QAAQ,eAAe,IAAI,IAC3BuD;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKtD;YACL,QAAQ;QACV;IACF;IAGFsD,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM/D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASiD;QACX;KACD;IAED,IAAIlE,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMoB,SAAS,MAAMC,yBACnB9C,MACAS;IAEF,OAAO;QACL,aAAaoC,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;QACnB,mBAAmBA,OAAO,iBAAiB;IAC7C;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB3D,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeoD;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMnE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASqD;QACX;KACD;IAED,MAAMxB,SAAS,MAAMrC,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkBoC,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const screenshotBase64 = context.screenshot.getData();\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const screenshotBase64 = context.screenshot.getData();\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.getData();\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n reasoning_content: result.reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","res","rawResponse","JSON","resRect","matchedElements","errors","Array","adaptBboxToRect","rectCenter","element","generateElementByPosition","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;AAiDA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OASrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,OAAO;IAEnDM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,4BAA4BP;IAEjD,IAAIQ,eAAeP;IACnB,IAAIQ,aAAab,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIc,cAAcd,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIe,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIf,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFa,eAAeb,QAAQ,YAAY,CAAC,WAAW;QAC/Cc,aAAad,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCe,cAAcf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCgB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIV,AAAW,iBAAXA,QAAyB;QAClC,MAAMa,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMvB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMkB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMC,MAAM,MAAMlB,SAASR,MAAMS;IAEjC,MAAMkB,cAAcC,KAAK,SAAS,CAACF,IAAI,OAAO;IAE9C,IAAIG;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBM,MAAM,OAAO,CAACN,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAG,UAAUI,gBACRP,IAAI,OAAO,CAAC,IAAI,EAChBP,YACAC,aACAf,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BgB,oBACAC,qBACAZ;YAGFjB,aAAa,WAAWoC;YAExB,MAAMK,aAAa;gBACjB,GAAGL,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMM,UAA+BC,0BACnCF,YACArB;YAEFkB,SAAS,EAAE;YAEX,IAAII,SACFL,kBAAkB;gBAACK;aAAQ;QAE/B;IACF,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACN,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEO,IAAI,CAAC,CAAC;aAFtBP,SAAS;YAACO;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMT;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAJ;QACA,OAAOD,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAec,gBAAgBnC,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEmC,kBAAkB,EAAEhC,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,OAAO;IAEnD,MAAMU,eAAe0B,4BAA4BhC;IACjD,MAAMiC,gCAAgCC,0BACpChD,wBAAwB6C;IAE1B,MAAMzC,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMgC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMhB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQ2C,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAzC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMoB,SAAS,MAAMC,yBACnB9C,MACAS;IAGF,IAAIsC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAahB,gBACjBe,aACA1C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BsD;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DlD,aAAa,wBAAwBuD;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASpB,MAAM,OAAO,CAACoB,OAC/B,GAAG,CAAC,CAACA,OACGnB,gBACLmB,MACA9C,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqBwD;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DxD,aAAa,iBAAiB0D;QAG9BN,cAAcQ,iBAAiBF,YAAY/C,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BoD;IAC1C;IAEA,IAAIS,cAAc7C;IAClB,IAAIoC,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1B/C,kBACAoC,aACArC,AAAW,iBAAXA;QAEF8C,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAajB,KAAK,SAAS,CAACiB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBtD,OAO7C;IACC,MAAM,EAAEuD,SAAS,EAAEtD,OAAO,EAAEuD,aAAa,EAAE9D,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe8C;IACrB,MAAMnD,mBAAmBL,QAAQ,UAAU,CAAC,OAAO;IAEnD,MAAMyD,wBAAwBC,uBAC5B3D,QAAQ,eAAe,IAAI,IAC3BuD;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKtD;YACL,QAAQ;QACV;IACF;IAGFsD,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM/D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASiD;QACX;KACD;IAED,IAAIlE,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIyB;IACf;IAEA,MAAMoB,SAAS,MAAMC,yBACnB9C,MACAS;IAEF,OAAO;QACL,aAAaoC,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;QACnB,mBAAmBA,OAAO,iBAAiB;IAC7C;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB3D,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeoD;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMnE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASqD;QACX;KACD;IAED,MAAMxB,SAAS,MAAMrC,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkBoC,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -1,13 +1,14 @@
1
1
  import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
2
2
  import { getDebug } from "@midscene/shared/logger";
3
3
  import { assert } from "@midscene/shared/utils";
4
- import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "../common.mjs";
4
+ import { buildYamlFlowFromPlans, fillBboxParam, finalizeActionName, findAllMidsceneLocatorField } from "../common.mjs";
5
5
  import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
6
6
  import { callAIWithObjectResponse } from "./service-caller/index.mjs";
7
7
  const debug = getDebug('planning');
8
8
  async function plan(userInstruction, opts) {
9
9
  const { context, modelConfig, conversationHistory } = opts;
10
- const { screenshotBase64, size } = context;
10
+ const { size } = context;
11
+ const screenshotBase64 = context.screenshot.getData();
11
12
  const { vlMode } = modelConfig;
12
13
  const systemPrompt = await systemPromptToTaskPlanning({
13
14
  actionSpace: opts.actionSpace,
@@ -88,13 +89,23 @@ async function plan(userInstruction, opts) {
88
89
  const actions = planFromAI.action ? [
89
90
  planFromAI.action
90
91
  ] : [];
92
+ let shouldContinuePlanning = true;
93
+ if (!actions.length) {
94
+ debug('no actions planned and no sleep instruction, stop planning');
95
+ shouldContinuePlanning = false;
96
+ }
97
+ if (actions[0]?.type === finalizeActionName) {
98
+ debug('finalize action planned, stop planning');
99
+ shouldContinuePlanning = false;
100
+ }
91
101
  const returnValue = {
92
102
  ...planFromAI,
93
103
  actions,
94
104
  rawResponse,
95
105
  usage,
96
106
  reasoning_content,
97
- yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
107
+ yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),
108
+ shouldContinuePlanning
98
109
  };
99
110
  assert(planFromAI, "can't get plans from AI");
100
111
  actions.forEach((action)=>{
@@ -108,7 +119,6 @@ async function plan(userInstruction, opts) {
108
119
  if (locateResult && void 0 !== vlMode) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, vlMode);
109
120
  });
110
121
  });
111
- if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
112
122
  conversationHistory.append({
113
123
  role: 'assistant',
114
124
  content: [
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n reasoning_content,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n modelConfig,\n {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n },\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","reasoning_content","callAIWithObjectResponse","undefined","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,yBACRL,MACAlB,aACA;QACE,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAe0B,SAAY1B,KAAK,SAAS;IACpE;IAGF,MAAM2B,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMO,cAAkC;QACtC,GAAGP,UAAU;QACbM;QACAL;QACAC;QACAC;QACA,UAAUK,uBACRF,SACA3B,KAAK,WAAW,EAChBqB,WAAW,KAAK;IAEpB;IAEAS,OAAOT,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBjC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC+B,SAAWA,OAAO,IAAI,KAAKC;QAG9BpC,MAAM,+BAA+BqC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENrC,MAAM,gBAAgBsC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB/B,AAAWoB,WAAXpB,QAElByB,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACA3B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEqB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBW,QAAQ,IAAI,CACV,8EACAxC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOM;AACT"}
1
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n finalizeActionName,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { size } = context;\n const screenshotBase64 = context.screenshot.getData();\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n reasoning_content,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n modelConfig,\n {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n },\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n if (!actions.length) {\n debug('no actions planned and no sleep instruction, stop planning');\n shouldContinuePlanning = false;\n }\n if (actions[0]?.type === finalizeActionName) {\n debug('finalize action planned, stop planning');\n shouldContinuePlanning = false;\n }\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","size","screenshotBase64","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","reasoning_content","callAIWithObjectResponse","undefined","actions","shouldContinuePlanning","finalizeActionName","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam"],"mappings":";;;;;;AAuBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,IAAI,EAAE,GAAGH;IACjB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,OAAO;IAEnD,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeJ;IACnB,IAAIK,aAAaN,KAAK,KAAK;IAC3B,IAAIO,cAAcP,KAAK,MAAM;IAC7B,MAAMQ,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,yBACRL,MACAlB,aACA;QACE,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAe0B,SAAY1B,KAAK,SAAS;IACpE;IAGF,MAAM2B,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,IAAIO,yBAAyB;IAC7B,IAAI,CAACD,QAAQ,MAAM,EAAE;QACnB/B,MAAM;QACNgC,yBAAyB;IAC3B;IACA,IAAID,OAAO,CAAC,EAAE,EAAE,SAASE,oBAAoB;QAC3CjC,MAAM;QACNgC,yBAAyB;IAC3B;IACA,MAAME,cAAkC;QACtC,GAAGT,UAAU;QACbM;QACAL;QACAC;QACAC;QACA,UAAUO,uBAAuBJ,SAAS3B,KAAK,WAAW;QAC1D4B;IACF;IAEAI,OAAOX,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAACM;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBnC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACiC,SAAWA,OAAO,IAAI,KAAKC;QAG9BtC,MAAM,+BAA+BuC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENvC,MAAM,gBAAgBwC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgBjC,AAAWoB,WAAXpB,QAElB2B,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACA7B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEAH,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOQ;AACT"}
@@ -1,11 +1,25 @@
1
1
  import { getZodDescription, getZodTypeName } from "@midscene/shared/zod-schema-utils";
2
2
  import { bboxDescription } from "./common.mjs";
3
- const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
4
- "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
3
+ const commonOutputFields = `"thought": string, // your thought process about the next action
4
+ "note"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action.
5
+ "log": string, // a brief preamble to the user explaining what you’re about to do
6
+ "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.`;
5
7
  const vlLocateParam = (vlMode)=>{
6
8
  if (vlMode) return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;
7
9
  return "{ prompt: string /* description of the target element */ }";
8
10
  };
11
+ const findDefaultValue = (field)=>{
12
+ let current = field;
13
+ const visited = new Set();
14
+ while(current && !visited.has(current)){
15
+ visited.add(current);
16
+ const currentWithDef = current;
17
+ if (!currentWithDef._def?.typeName) break;
18
+ if ('ZodDefault' === currentWithDef._def.typeName) return currentWithDef._def.defaultValue?.();
19
+ if ('ZodOptional' === currentWithDef._def.typeName || 'ZodNullable' === currentWithDef._def.typeName) current = currentWithDef._def.innerType;
20
+ else break;
21
+ }
22
+ };
9
23
  const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
10
24
  const tab = ' ';
11
25
  const fields = [];
@@ -21,8 +35,16 @@ const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
21
35
  const keyWithOptional = isOptional ? `${key}?` : key;
22
36
  const typeName = getZodTypeName(field, locatorSchemaTypeDescription);
23
37
  const description = getZodDescription(field);
38
+ const defaultValue = findDefaultValue(field);
39
+ const hasDefault = void 0 !== defaultValue;
24
40
  let paramLine = `${keyWithOptional}: ${typeName}`;
25
- if (description) paramLine += ` // ${description}`;
41
+ const comments = [];
42
+ if (description) comments.push(description);
43
+ if (hasDefault) {
44
+ const defaultStr = 'string' == typeof defaultValue ? `"${defaultValue}"` : JSON.stringify(defaultValue);
45
+ comments.push(`default: ${defaultStr}`);
46
+ }
47
+ if (comments.length > 0) paramLine += ` // ${comments.join(', ')}`;
26
48
  paramLines.push(paramLine);
27
49
  }
28
50
  if (paramLines.length > 0) {
@@ -76,7 +98,6 @@ Please tell what the next one action is (or null if no action should be done) to
76
98
  - Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
77
99
  - Make sure the previous actions are completed successfully before performing the next step
78
100
  - If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the "error" field to the error message.
79
- - If there is nothing to do but waiting, set the "sleep" field to the positive waiting time in milliseconds and null for the "action" field.
80
101
  - Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the "Print_Assert_Result" action.
81
102
 
82
103
  ## Supporting actions
@@ -88,7 +109,6 @@ ${logFieldInstruction}
88
109
 
89
110
  Return in JSON format:
90
111
  {
91
- "log": string, // a brief preamble to the user explaining what you’re about to do
92
112
  ${commonOutputFields}
93
113
  "action":
94
114
  {
@@ -96,16 +116,14 @@ Return in JSON format:
96
116
  "param"?: { // The parameter of the action, if any
97
117
  // k-v style parameter fields
98
118
  },
99
- } | null,
100
- ,
101
- "sleep"?: number, // The sleep time after the action, in milliseconds.
119
+ } | null
102
120
  }
103
121
 
104
122
  For example, if the instruction is to login and the form has already been filled, this is a valid return value:
105
123
 
106
124
  {
125
+ "thought": "The form has already been filled, I need to click the login button to login",
107
126
  "log": "Click the login button",
108
- "more_actions_needed_by_instruction": false,
109
127
  "action": {
110
128
  "type": "Tap",
111
129
  "param": {
@@ -115,6 +133,23 @@ For example, if the instruction is to login and the form has already been filled
115
133
  }
116
134
  }
117
135
  }
136
+
137
+ For example, if the instruction is to find out every title in the screenshot, the return value should be:
138
+
139
+ {
140
+ "thought": "I need to note the titles in the current screenshot for further processing and scroll to find more titles",
141
+ "note": "The titles in the current screenshot are: 'Hello, world!', 'Midscene 101', 'Model strategy'",
142
+ "log": "Scroll to find more titles",
143
+ "action": {
144
+ "type": "Scroll",
145
+ "param": {
146
+ "locate": {
147
+ "prompt": "The page content area"
148
+ },
149
+ "direction": "down"
150
+ }
151
+ }
152
+ }
118
153
  `;
119
154
  }
120
155
  export { descriptionForAction, systemPromptToTaskPlanning };