@midscene/core 1.0.1-beta-20251204013753.0 → 1.0.1-beta-20251204032807.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +1 -1
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +15 -14
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/llm-planning.mjs +1 -3
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +31 -26
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/device/index.mjs +3 -12
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/service/index.mjs +1 -1
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +29 -61
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +1 -1
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +15 -14
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/llm-planning.js +1 -3
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +31 -26
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/device/index.js +2 -14
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/service/index.js +1 -1
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +29 -61
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/tasks.d.ts +2 -2
- package/dist/types/ai-model/llm-planning.d.ts +1 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
- package/dist/types/device/index.d.ts +4 -15
- package/dist/types/types.d.ts +2 -1
- package/dist/types/yaml.d.ts +6 -25
- package/package.json +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIActionType,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot();\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","callAIWithObjectResponse","AIActionType","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","undefined","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAQC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,CAAC,yBAAyB,EAAEhB,KAAK,aAAa,CAAC,8CAA8C,EAAED,gBAAgB,mBAAmB,CAAC;gBAC3I;aACD;QACH;KACD;IAED,IAAIkB;IAEJ,IAAId,oBAAoB,sBAAsB,EAAE;QAC9Cc,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGd,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEc,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKR;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACc;IAC3B,MAAMC,aAAaf,oBAAoB,QAAQ;IAE/C,MAAMgB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASZ;QAAa;WACrCS;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACN,GAAG,MAAMC,yBACRJ,MACAK,aAAa,IAAI,EACjBtB;IAGF,MAAMuB,UAAUL,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMM,cAAkC;QACtC,GAAGN,UAAU;QACbK;QACAJ;QACAC;QACA,UAAUK,uBACRF,SACAzB,KAAK,WAAW,EAChBoB,WAAW,KAAK;IAEpB;IAEAQ,OAAOR,YAAY;IAEnBK,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsB/B,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC6B,SAAWA,OAAO,IAAI,KAAKC;QAG9BlC,MAAM,+BAA+BmC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENnC,MAAM,gBAAgBoC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB7B,AAAW8B,WAAX9B,QAElBuB,OAAO,KAAK,CAACK,MAAM,GAAGG,cACpBF,cACAzB,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEmB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACAvC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMkB;YACR;SACD;IACH;IAEA,OAAOK;AACT"}
|
|
@@ -101,30 +101,52 @@ const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
|
|
|
101
101
|
${tab}${fields.join(`\n${tab}`)}
|
|
102
102
|
`.trim();
|
|
103
103
|
};
|
|
104
|
-
async function systemPromptToTaskPlanning({ actionSpace, vlMode, includeBbox
|
|
104
|
+
async function systemPromptToTaskPlanning({ actionSpace, vlMode, includeBbox }) {
|
|
105
105
|
if (includeBbox && !vlMode) throw new Error('vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.');
|
|
106
106
|
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam(includeBbox ? vlMode : void 0)));
|
|
107
107
|
const actionList = actionDescriptionList.join('\n');
|
|
108
|
-
const
|
|
109
|
-
|
|
108
|
+
const logFieldInstruction = `
|
|
109
|
+
## About the \`log\` field (preamble message)
|
|
110
|
+
|
|
111
|
+
The \`log\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:
|
|
112
|
+
|
|
113
|
+
- **Use the same language as the user's instruction**
|
|
114
|
+
- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).
|
|
115
|
+
- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.
|
|
116
|
+
- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.
|
|
117
|
+
|
|
118
|
+
**Examples:**
|
|
119
|
+
- "Click the login button"
|
|
120
|
+
- "Scroll to find the 'Yes' button in popup"
|
|
121
|
+
- "Previous actions failed to find the 'Yes' button, i will try again"
|
|
122
|
+
- "Go back to find the login button"
|
|
123
|
+
`;
|
|
110
124
|
return `
|
|
111
|
-
Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action to accomplish the instruction.
|
|
125
|
+
Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.
|
|
112
126
|
|
|
113
127
|
Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
114
128
|
|
|
115
|
-
|
|
116
|
-
|
|
129
|
+
## Rules
|
|
130
|
+
|
|
131
|
+
- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.
|
|
117
132
|
- Give just the next ONE action you should do
|
|
133
|
+
- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
|
|
118
134
|
- Make sure the previous actions are completed successfully before performing the next step
|
|
119
135
|
- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the "error" field to the error message.
|
|
120
|
-
- If
|
|
136
|
+
- If there is nothing to do but waiting, set the "sleep" field to the positive waiting time in milliseconds and null for the "action" field.
|
|
137
|
+
- When the next step is to assert something, this is a very important step, you should think about it carefully and give a solid result. Write your result in the "log" field like this: "Assert: <condition>. I think <...>, so the result is <true / false>". You don't need to give the next one action when you are asserting something. If the assertion result is false, think this an fatal error and set the reason into the "error" field. If the assertion result is true, you can continue to the next step.
|
|
121
138
|
|
|
122
|
-
Supporting actions
|
|
139
|
+
## Supporting actions
|
|
123
140
|
${actionList}
|
|
124
141
|
|
|
142
|
+
${logFieldInstruction}
|
|
143
|
+
|
|
144
|
+
## Return format
|
|
145
|
+
|
|
125
146
|
Return in JSON format:
|
|
126
147
|
{
|
|
127
|
-
|
|
148
|
+
"log": string, // a brief preamble to the user explaining what you’re about to do
|
|
149
|
+
${commonOutputFields}
|
|
128
150
|
"action":
|
|
129
151
|
{
|
|
130
152
|
// one of the supporting actions
|
|
@@ -132,23 +154,6 @@ Return in JSON format:
|
|
|
132
154
|
,
|
|
133
155
|
"sleep"?: number, // The sleep time after the action, in milliseconds.
|
|
134
156
|
}
|
|
135
|
-
|
|
136
|
-
For example, when the instruction is "click 'Confirm' button, and click 'Yes' in popup" and the previous log shows "The 'Confirm' button has been clicked", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.
|
|
137
|
-
|
|
138
|
-
this and output the JSON:
|
|
139
|
-
|
|
140
|
-
{
|
|
141
|
-
${exampleLogField}"action": {
|
|
142
|
-
"type": "Tap",
|
|
143
|
-
"param": {
|
|
144
|
-
"locate": {
|
|
145
|
-
${vlMode ? '"bbox": [100, 100, 200, 200],' : ''}
|
|
146
|
-
"prompt": "The 'Yes' button in popup"
|
|
147
|
-
}
|
|
148
|
-
}
|
|
149
|
-
},
|
|
150
|
-
"more_actions_needed_by_instruction": false,
|
|
151
|
-
}
|
|
152
157
|
`;
|
|
153
158
|
}
|
|
154
159
|
const planSchema = {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction, ThinkingStrategy } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { ifMidsceneLocatorField } from '../../common';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as any;\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n // Helper function to get type name from zod schema\n const getTypeName = (field: any): string => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n const actualField = unwrapField(field);\n const fieldTypeName = actualField._def?.typeName;\n\n if (fieldTypeName === 'ZodString') return 'string';\n if (fieldTypeName === 'ZodNumber') return 'number';\n if (fieldTypeName === 'ZodBoolean') return 'boolean';\n if (fieldTypeName === 'ZodArray') return 'array';\n if (fieldTypeName === 'ZodObject') {\n // Check if this is a passthrough object (like MidsceneLocation)\n if (ifMidsceneLocatorField(actualField)) {\n return locatorSchemaTypeDescription;\n }\n return 'object';\n }\n if (fieldTypeName === 'ZodEnum') {\n const values =\n (actualField._def?.values as unknown[] | undefined)\n ?.map((option: unknown) => String(`'${option}'`))\n .join(', ') ?? 'enum';\n\n return `enum(${values})`;\n }\n // Handle ZodUnion by taking the first option (for display purposes)\n if (fieldTypeName === 'ZodUnion') {\n const options = actualField._def?.options as any[] | undefined;\n if (options && options.length > 0) {\n // For unions, list all types\n const types = options.map((opt: any) => getTypeName(opt));\n return types.join(' | ');\n }\n return 'union';\n }\n\n console.warn(\n 'failed to parse Zod type. This may lead to wrong params from the LLM.\\n',\n actualField._def,\n );\n return actualField.toString();\n };\n\n // Helper function to get description from zod schema\n const getDescription = (field: z.ZodTypeAny): string | null => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n // Check for direct description on the original field (wrapper may have description)\n if ('description' in field) {\n return field.description || null;\n }\n\n const actualField = unwrapField(field);\n\n // Check for description on the unwrapped field\n if ('description' in actualField) {\n return actualField.description || null;\n }\n\n // Check for MidsceneLocation fields and add description\n if (actualField._def?.typeName === 'ZodObject') {\n if ('midscene_location_field_flag' in actualField._def.shape()) {\n return 'Location information for the target element';\n }\n }\n\n return null;\n };\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as any).isOptional === 'function' &&\n (field as any).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name\n const typeName = getTypeName(field);\n\n // Get description\n const description = getDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n // For simple primitive types, the param should be passed directly as the value\n const schemaTypeName = schema._def?.typeName;\n let typeName = 'unknown';\n\n if (schemaTypeName === 'ZodString') typeName = 'string';\n else if (schemaTypeName === 'ZodNumber') typeName = 'number';\n else if (schemaTypeName === 'ZodBoolean') typeName = 'boolean';\n\n // Get description if available\n const description = 'description' in schema ? schema.description : null;\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n thinkingStrategy,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n thinkingStrategy: ThinkingStrategy;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n // Conditionally include log field based on thinkingStrategy\n const logFieldDefinition =\n thinkingStrategy === 'off'\n ? ''\n : '\"log\": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: \"The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action \\'{ action-type }\\' to do this\". If no action should be done, log the reason. Use the same language as the user\\'s instruction.\\n ';\n\n const exampleLogField =\n thinkingStrategy === 'off'\n ? ''\n : \"\\\"log\\\": \\\"The user wants to do click 'Confirm' button, and click 'Yes' in popup. The current progress is ..., we still need to ... . Now i am going to compose an action '...' to click 'Yes' in popup.\\\",\\n \";\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\nRestriction:\n- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If the user mentions something to assert and the condition is not met, you should think this is an error and set the \"error\" field to the error message.\n\nSupporting actions:\n${actionList}\n\nReturn in JSON format:\n{\n ${logFieldDefinition}${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, when the instruction is \"click 'Confirm' button, and click 'Yes' in popup\" and the previous log shows \"The 'Confirm' button has been clicked\", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.\n\nthis and output the JSON:\n\n{\n ${exampleLogField}\"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": {\n ${vlMode ? `\"bbox\": [100, 100, 200, 200],` : ''}\n \"prompt\": \"The 'Yes' button in popup\"\n }\n }\n },\n \"more_actions_needed_by_instruction\": false,\n}\n`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","getTypeName","field","unwrapField","f","typeName","actualField","fieldTypeName","ifMidsceneLocatorField","values","option","String","options","types","opt","console","getDescription","key","Object","isOptional","keyWithOptional","description","paramLine","line","schemaTypeName","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","thinkingStrategy","Error","actionDescriptionList","undefined","actionList","logFieldDefinition","exampleLogField","planSchema"],"mappings":";;AASA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QACjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAG1B,MAAMG,cAAc,CAACC;gBAEnB,MAAMC,cAAc,CAACC;oBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;oBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;oBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;oBAIrC,IAAIC,AAAa,iBAAbA,UAEF;wBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;oBAClC;oBAGF,OAAOA;gBACT;gBAEA,MAAME,cAAcH,YAAYD;gBAChC,MAAMK,gBAAgBD,YAAY,IAAI,EAAE;gBAExC,IAAIC,AAAkB,gBAAlBA,eAA+B,OAAO;gBAC1C,IAAIA,AAAkB,gBAAlBA,eAA+B,OAAO;gBAC1C,IAAIA,AAAkB,iBAAlBA,eAAgC,OAAO;gBAC3C,IAAIA,AAAkB,eAAlBA,eAA8B,OAAO;gBACzC,IAAIA,AAAkB,gBAAlBA,eAA+B;oBAEjC,IAAIC,uBAAuBF,cACzB,OAAOZ;oBAET,OAAO;gBACT;gBACA,IAAIa,AAAkB,cAAlBA,eAA6B;oBAC/B,MAAME,SACHH,YAAY,IAAI,EAAE,QACf,IAAI,CAACI,SAAoBC,OAAO,CAAC,CAAC,EAAED,OAAO,CAAC,CAAC,GAC9C,KAAK,SAAS;oBAEnB,OAAO,CAAC,KAAK,EAAED,OAAO,CAAC,CAAC;gBAC1B;gBAEA,IAAIF,AAAkB,eAAlBA,eAA8B;oBAChC,MAAMK,UAAUN,YAAY,IAAI,EAAE;oBAClC,IAAIM,WAAWA,QAAQ,MAAM,GAAG,GAAG;wBAEjC,MAAMC,QAAQD,QAAQ,GAAG,CAAC,CAACE,MAAab,YAAYa;wBACpD,OAAOD,MAAM,IAAI,CAAC;oBACpB;oBACA,OAAO;gBACT;gBAEAE,QAAQ,IAAI,CACV,2EACAT,YAAY,IAAI;gBAElB,OAAOA,YAAY,QAAQ;YAC7B;YAGA,MAAMU,iBAAiB,CAACd;gBAEtB,MAAMC,cAAc,CAACC;oBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;oBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;oBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;oBAIrC,IAAIC,AAAa,iBAAbA,UAEF;wBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;oBAClC;oBAGF,OAAOA;gBACT;gBAGA,IAAI,iBAAiBF,OACnB,OAAOA,MAAM,WAAW,IAAI;gBAG9B,MAAMI,cAAcH,YAAYD;gBAGhC,IAAI,iBAAiBI,aACnB,OAAOA,YAAY,WAAW,IAAI;gBAIpC,IAAIA,YAAY,IAAI,EAAE,aAAa,aACjC;oBAAA,IAAI,kCAAkCA,YAAY,IAAI,CAAC,KAAK,IAC1D,OAAO;gBACT;gBAGF,OAAO;YACT;YAEA,KAAK,MAAM,CAACW,KAAKf,MAAM,IAAIgB,OAAO,OAAO,CAAClB,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMiB,aACJ,AAAqC,cAArC,OAAQjB,MAAc,UAAU,IAC/BA,MAAc,UAAU;gBAC3B,MAAMkB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMZ,WAAWJ,YAAYC;gBAG7B,MAAMmB,cAAcL,eAAed;gBAGnC,IAAIoB,YAAY,GAAGF,gBAAgB,EAAE,EAAEf,UAAU;gBACjD,IAAIgB,aACFC,aAAa,CAAC,IAAI,EAAED,aAAa;gBAGnCxB,WAAW,IAAI,CAACyB;YAClB;YAIF,IAAIzB,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAAC0B;oBAClB3B,OAAO,IAAI,CAAC,CAAC,IAAI,EAAE2B,MAAM;gBAC3B;YACF;QACF,OAAO;YAGL,MAAMC,iBAAiB1B,OAAO,IAAI,EAAE;YACpC,IAAIO,WAAW;YAEf,IAAImB,AAAmB,gBAAnBA,gBAAgCnB,WAAW;iBAC1C,IAAImB,AAAmB,gBAAnBA,gBAAgCnB,WAAW;iBAC/C,IAAImB,AAAmB,iBAAnBA,gBAAiCnB,WAAW;YAGrD,MAAMgB,cAAc,iBAAiBvB,SAASA,OAAO,WAAW,GAAG;YAGnE,IAAI2B,mBAAmB,CAAC,SAAS,EAAEpB,UAAU;YAC7C,IAAIgB,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpB7B,OAAO,IAAI,CAAC6B;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEhC,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAe+B,2BAA2B,EAC/CC,WAAW,EACXrC,MAAM,EACNsC,WAAW,EACXC,gBAAgB,EAMjB;IAEC,IAAID,eAAe,CAACtC,QAClB,MAAM,IAAIwC,MACR;IAIJ,MAAMC,wBAAwBJ,YAAY,GAAG,CAAC,CAAClC,SACtCD,qBACLC,QACAJ,cAAcuC,cAActC,SAAS0C;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAG9C,MAAMG,qBACJL,AAAqB,UAArBA,mBACI,KACA;IAEN,MAAMM,kBACJN,AAAqB,UAArBA,mBACI,KACA;IAEN,OAAO,CAAC;;;;;;;;;;;;;AAaV,EAAEI,WAAW;;;;EAIX,EAAEC,qBAAqB9C,mBAAmB;;;;;;;;;;;;;;EAc1C,EAAE+C,gBAAgB;;;;QAIZ,EAAE7C,SAAS,kCAAkC,GAAG;;;;;;;AAOxD,CAAC;AACD;AAEO,MAAM8C,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { ifMidsceneLocatorField } from '../../common';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as any;\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n // Helper function to get type name from zod schema\n const getTypeName = (field: any): string => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n const actualField = unwrapField(field);\n const fieldTypeName = actualField._def?.typeName;\n\n if (fieldTypeName === 'ZodString') return 'string';\n if (fieldTypeName === 'ZodNumber') return 'number';\n if (fieldTypeName === 'ZodBoolean') return 'boolean';\n if (fieldTypeName === 'ZodArray') return 'array';\n if (fieldTypeName === 'ZodObject') {\n // Check if this is a passthrough object (like MidsceneLocation)\n if (ifMidsceneLocatorField(actualField)) {\n return locatorSchemaTypeDescription;\n }\n return 'object';\n }\n if (fieldTypeName === 'ZodEnum') {\n const values =\n (actualField._def?.values as unknown[] | undefined)\n ?.map((option: unknown) => String(`'${option}'`))\n .join(', ') ?? 'enum';\n\n return `enum(${values})`;\n }\n // Handle ZodUnion by taking the first option (for display purposes)\n if (fieldTypeName === 'ZodUnion') {\n const options = actualField._def?.options as any[] | undefined;\n if (options && options.length > 0) {\n // For unions, list all types\n const types = options.map((opt: any) => getTypeName(opt));\n return types.join(' | ');\n }\n return 'union';\n }\n\n console.warn(\n 'failed to parse Zod type. This may lead to wrong params from the LLM.\\n',\n actualField._def,\n );\n return actualField.toString();\n };\n\n // Helper function to get description from zod schema\n const getDescription = (field: z.ZodTypeAny): string | null => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n // Check for direct description on the original field (wrapper may have description)\n if ('description' in field) {\n return field.description || null;\n }\n\n const actualField = unwrapField(field);\n\n // Check for description on the unwrapped field\n if ('description' in actualField) {\n return actualField.description || null;\n }\n\n // Check for MidsceneLocation fields and add description\n if (actualField._def?.typeName === 'ZodObject') {\n if ('midscene_location_field_flag' in actualField._def.shape()) {\n return 'Location information for the target element';\n }\n }\n\n return null;\n };\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as any).isOptional === 'function' &&\n (field as any).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name\n const typeName = getTypeName(field);\n\n // Get description\n const description = getDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n // For simple primitive types, the param should be passed directly as the value\n const schemaTypeName = schema._def?.typeName;\n let typeName = 'unknown';\n\n if (schemaTypeName === 'ZodString') typeName = 'string';\n else if (schemaTypeName === 'ZodNumber') typeName = 'number';\n else if (schemaTypeName === 'ZodBoolean') typeName = 'boolean';\n\n // Get description if available\n const description = 'description' in schema ? schema.description : null;\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- When the next step is to assert something, this is a very important step, you should think about it carefully and give a solid result. Write your result in the \"log\" field like this: \"Assert: <condition>. I think <...>, so the result is <true / false>\". You don't need to give the next one action when you are asserting something. If the assertion result is false, think this an fatal error and set the reason into the \"error\" field. If the assertion result is true, you can continue to the next step.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","getTypeName","field","unwrapField","f","typeName","actualField","fieldTypeName","ifMidsceneLocatorField","values","option","String","options","types","opt","console","getDescription","key","Object","isOptional","keyWithOptional","description","paramLine","line","schemaTypeName","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction","planSchema"],"mappings":";;AASA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QACjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAG1B,MAAMG,cAAc,CAACC;gBAEnB,MAAMC,cAAc,CAACC;oBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;oBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;oBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;oBAIrC,IAAIC,AAAa,iBAAbA,UAEF;wBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;oBAClC;oBAGF,OAAOA;gBACT;gBAEA,MAAME,cAAcH,YAAYD;gBAChC,MAAMK,gBAAgBD,YAAY,IAAI,EAAE;gBAExC,IAAIC,AAAkB,gBAAlBA,eAA+B,OAAO;gBAC1C,IAAIA,AAAkB,gBAAlBA,eAA+B,OAAO;gBAC1C,IAAIA,AAAkB,iBAAlBA,eAAgC,OAAO;gBAC3C,IAAIA,AAAkB,eAAlBA,eAA8B,OAAO;gBACzC,IAAIA,AAAkB,gBAAlBA,eAA+B;oBAEjC,IAAIC,uBAAuBF,cACzB,OAAOZ;oBAET,OAAO;gBACT;gBACA,IAAIa,AAAkB,cAAlBA,eAA6B;oBAC/B,MAAME,SACHH,YAAY,IAAI,EAAE,QACf,IAAI,CAACI,SAAoBC,OAAO,CAAC,CAAC,EAAED,OAAO,CAAC,CAAC,GAC9C,KAAK,SAAS;oBAEnB,OAAO,CAAC,KAAK,EAAED,OAAO,CAAC,CAAC;gBAC1B;gBAEA,IAAIF,AAAkB,eAAlBA,eAA8B;oBAChC,MAAMK,UAAUN,YAAY,IAAI,EAAE;oBAClC,IAAIM,WAAWA,QAAQ,MAAM,GAAG,GAAG;wBAEjC,MAAMC,QAAQD,QAAQ,GAAG,CAAC,CAACE,MAAab,YAAYa;wBACpD,OAAOD,MAAM,IAAI,CAAC;oBACpB;oBACA,OAAO;gBACT;gBAEAE,QAAQ,IAAI,CACV,2EACAT,YAAY,IAAI;gBAElB,OAAOA,YAAY,QAAQ;YAC7B;YAGA,MAAMU,iBAAiB,CAACd;gBAEtB,MAAMC,cAAc,CAACC;oBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;oBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;oBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;oBAIrC,IAAIC,AAAa,iBAAbA,UAEF;wBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;oBAClC;oBAGF,OAAOA;gBACT;gBAGA,IAAI,iBAAiBF,OACnB,OAAOA,MAAM,WAAW,IAAI;gBAG9B,MAAMI,cAAcH,YAAYD;gBAGhC,IAAI,iBAAiBI,aACnB,OAAOA,YAAY,WAAW,IAAI;gBAIpC,IAAIA,YAAY,IAAI,EAAE,aAAa,aACjC;oBAAA,IAAI,kCAAkCA,YAAY,IAAI,CAAC,KAAK,IAC1D,OAAO;gBACT;gBAGF,OAAO;YACT;YAEA,KAAK,MAAM,CAACW,KAAKf,MAAM,IAAIgB,OAAO,OAAO,CAAClB,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMiB,aACJ,AAAqC,cAArC,OAAQjB,MAAc,UAAU,IAC/BA,MAAc,UAAU;gBAC3B,MAAMkB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMZ,WAAWJ,YAAYC;gBAG7B,MAAMmB,cAAcL,eAAed;gBAGnC,IAAIoB,YAAY,GAAGF,gBAAgB,EAAE,EAAEf,UAAU;gBACjD,IAAIgB,aACFC,aAAa,CAAC,IAAI,EAAED,aAAa;gBAGnCxB,WAAW,IAAI,CAACyB;YAClB;YAIF,IAAIzB,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAAC0B;oBAClB3B,OAAO,IAAI,CAAC,CAAC,IAAI,EAAE2B,MAAM;gBAC3B;YACF;QACF,OAAO;YAGL,MAAMC,iBAAiB1B,OAAO,IAAI,EAAE;YACpC,IAAIO,WAAW;YAEf,IAAImB,AAAmB,gBAAnBA,gBAAgCnB,WAAW;iBAC1C,IAAImB,AAAmB,gBAAnBA,gBAAgCnB,WAAW;iBAC/C,IAAImB,AAAmB,iBAAnBA,gBAAiCnB,WAAW;YAGrD,MAAMgB,cAAc,iBAAiBvB,SAASA,OAAO,WAAW,GAAG;YAGnE,IAAI2B,mBAAmB,CAAC,SAAS,EAAEpB,UAAU;YAC7C,IAAIgB,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpB7B,OAAO,IAAI,CAAC6B;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEhC,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAe+B,2BAA2B,EAC/CC,WAAW,EACXrC,MAAM,EACNsC,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACtC,QAClB,MAAM,IAAIuC,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAAClC,SACtCD,qBACLC,QACAJ,cAAcuC,cAActC,SAASyC;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAE7C,mBAAmB;;;;;;;;AAQvB,CAAC;AACD;AAEO,MAAM8C,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF"}
|
package/dist/es/device/index.mjs
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { getMidsceneLocationSchema } from "../common.mjs";
|
|
2
|
-
import { getDebug } from "@midscene/shared/logger";
|
|
3
2
|
import { z } from "zod";
|
|
4
3
|
class AbstractInterface {
|
|
5
4
|
}
|
|
@@ -150,23 +149,15 @@ const defineActionClearInput = (call)=>defineAction({
|
|
|
150
149
|
call
|
|
151
150
|
});
|
|
152
151
|
const actionAssertParamSchema = z.object({
|
|
153
|
-
|
|
154
|
-
result: z.boolean().describe('Whether the assertion is truthy, return true or false')
|
|
152
|
+
condition: z.string().describe('The condition of the assertion')
|
|
155
153
|
});
|
|
156
154
|
const defineActionAssert = (call)=>defineAction({
|
|
157
155
|
name: 'Assert',
|
|
158
|
-
description: 'If the user explicitly requires making an assertion (like "there should be a button with text "Yes" in the popup"), think about it, provide
|
|
156
|
+
description: 'If the user explicitly requires making an assertion (like "there should be a button with text "Yes" in the popup"), this tool will think about it, and then provide a solid result',
|
|
159
157
|
interfaceAlias: 'aiAssert',
|
|
160
158
|
paramSchema: actionAssertParamSchema,
|
|
161
159
|
call
|
|
162
160
|
});
|
|
163
|
-
|
|
164
|
-
const debug = getDebug('core:planning-assert');
|
|
165
|
-
return defineActionAssert(async (param)=>{
|
|
166
|
-
debug('assertion', param.thought, param.result);
|
|
167
|
-
if (!param.result) throw new Error(`Assertion failed: ${param.thought}`);
|
|
168
|
-
});
|
|
169
|
-
};
|
|
170
|
-
export { AbstractInterface, ActionLongPressParamSchema, ActionSwipeParamSchema, actionAssertParamSchema, actionClearInputParamSchema, actionDoubleClickParamSchema, actionDragAndDropParamSchema, actionHoverParamSchema, actionInputParamSchema, actionKeyboardPressParamSchema, actionRightClickParamSchema, actionScrollParamSchema, actionTapParamSchema, createAssertAction, defineAction, defineActionAssert, defineActionClearInput, defineActionDoubleClick, defineActionDragAndDrop, defineActionHover, defineActionInput, defineActionKeyboardPress, defineActionLongPress, defineActionRightClick, defineActionScroll, defineActionSwipe, defineActionTap };
|
|
161
|
+
export { AbstractInterface, ActionLongPressParamSchema, ActionSwipeParamSchema, actionAssertParamSchema, actionClearInputParamSchema, actionDoubleClickParamSchema, actionDragAndDropParamSchema, actionHoverParamSchema, actionInputParamSchema, actionKeyboardPressParamSchema, actionRightClickParamSchema, actionScrollParamSchema, actionTapParamSchema, defineAction, defineActionAssert, defineActionClearInput, defineActionDoubleClick, defineActionDragAndDrop, defineActionHover, defineActionInput, defineActionKeyboardPress, defineActionLongPress, defineActionRightClick, defineActionScroll, defineActionSwipe, defineActionTap };
|
|
171
162
|
|
|
172
163
|
//# sourceMappingURL=index.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"device/index.mjs","sources":["../../../src/device/index.ts"],"sourcesContent":["import { getMidsceneLocationSchema } from '@/common';\nimport type {\n ActionScrollParam,\n DeviceAction,\n LocateResultElement,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport type { ElementNode } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { z } from 'zod';\nimport type { ElementCacheFeature, Rect, Size, UIContext } from '../types';\n\nexport abstract class AbstractInterface {\n abstract interfaceType: string;\n\n abstract screenshotBase64(): Promise<string>;\n abstract size(): Promise<Size>;\n abstract actionSpace(): DeviceAction[] | Promise<DeviceAction[]>;\n\n abstract cacheFeatureForRect?(\n rect: Rect,\n options?: {\n targetDescription?: string;\n modelConfig?: IModelConfig;\n },\n ): Promise<ElementCacheFeature>;\n abstract rectMatchesCacheFeature?(\n feature: ElementCacheFeature,\n ): Promise<Rect>;\n\n abstract destroy?(): Promise<void>;\n\n abstract describe?(): string;\n abstract beforeInvokeAction?(actionName: string, param: any): Promise<void>;\n abstract afterInvokeAction?(actionName: string, param: any): Promise<void>;\n\n // @deprecated do NOT extend this method\n abstract getElementsNodeTree?: () => Promise<ElementNode>;\n\n // @deprecated do NOT extend this method\n abstract url?: () => string | Promise<string>;\n\n // @deprecated do NOT extend this method\n abstract evaluateJavaScript?<T = any>(script: string): Promise<T>;\n\n // @deprecated do NOT extend this method\n abstract getContext?(): Promise<UIContext>;\n}\n\n// Generic function to define actions with proper type inference\n// TRuntime allows specifying a different type for the runtime parameter (after location resolution)\n// TReturn allows specifying the return type of the action\nexport const defineAction = <\n TSchema extends z.ZodType | undefined = undefined,\n TRuntime = TSchema extends z.ZodType ? z.infer<TSchema> : undefined,\n TReturn = any,\n>(\n config: {\n name: string;\n description: string;\n interfaceAlias?: string;\n paramSchema?: TSchema;\n call: (param: TRuntime) => Promise<TReturn> | TReturn;\n } & Partial<\n Omit<\n DeviceAction<TRuntime, TReturn>,\n 'name' | 'description' | 'interfaceAlias' | 'paramSchema' | 'call'\n >\n >,\n): DeviceAction<TRuntime, TReturn> => {\n return config as any; // Type assertion needed because schema validation type differs from runtime type\n};\n\n// Tap\nexport const actionTapParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The element to be tapped'),\n});\n// Override the inferred type to use LocateResultElement for the runtime locate field\nexport type ActionTapParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionTap = (\n call: (param: ActionTapParam) => Promise<void>,\n): DeviceAction<ActionTapParam> => {\n return defineAction<typeof actionTapParamSchema, ActionTapParam>({\n name: 'Tap',\n description: 'Tap the element',\n interfaceAlias: 'aiTap',\n paramSchema: actionTapParamSchema,\n call,\n });\n};\n\n// RightClick\nexport const actionRightClickParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be right clicked',\n ),\n});\nexport type ActionRightClickParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionRightClick = (\n call: (param: ActionRightClickParam) => Promise<void>,\n): DeviceAction<ActionRightClickParam> => {\n return defineAction<\n typeof actionRightClickParamSchema,\n ActionRightClickParam\n >({\n name: 'RightClick',\n description: 'Right click the element',\n interfaceAlias: 'aiRightClick',\n paramSchema: actionRightClickParamSchema,\n call,\n });\n};\n\n// DoubleClick\nexport const actionDoubleClickParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be double clicked',\n ),\n});\nexport type ActionDoubleClickParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionDoubleClick = (\n call: (param: ActionDoubleClickParam) => Promise<void>,\n): DeviceAction<ActionDoubleClickParam> => {\n return defineAction<\n typeof actionDoubleClickParamSchema,\n ActionDoubleClickParam\n >({\n name: 'DoubleClick',\n description: 'Double click the element',\n interfaceAlias: 'aiDoubleClick',\n paramSchema: actionDoubleClickParamSchema,\n call,\n });\n};\n\n// Hover\nexport const actionHoverParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The element to be hovered'),\n});\nexport type ActionHoverParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionHover = (\n call: (param: ActionHoverParam) => Promise<void>,\n): DeviceAction<ActionHoverParam> => {\n return defineAction<typeof actionHoverParamSchema, ActionHoverParam>({\n name: 'Hover',\n description: 'Move the mouse to the element',\n interfaceAlias: 'aiHover',\n paramSchema: actionHoverParamSchema,\n call,\n });\n};\n\n// Input\nconst inputLocateDescription =\n 'the position of the placeholder or text content in the target input field. If there is no content, locate the center of the input field.';\nexport const actionInputParamSchema = z.object({\n value: z\n .union([z.string(), z.number()])\n .transform((val) => String(val))\n .describe(\n 'The text to input. Provide the final content for replace/append modes, or an empty string when using clear mode to remove existing text.',\n ),\n locate: getMidsceneLocationSchema()\n .describe(inputLocateDescription)\n .optional(),\n mode: z\n .enum(['replace', 'clear', 'append'])\n .default('replace')\n .optional()\n .describe(\n 'Input mode: \"replace\" (default) - clear the field and input the value; \"append\" - append the value to existing content; \"clear\" - clear the field without inputting new text.',\n ),\n});\nexport type ActionInputParam = {\n value: string;\n locate?: LocateResultElement;\n mode?: 'replace' | 'clear' | 'append';\n};\n\nexport const defineActionInput = (\n call: (param: ActionInputParam) => Promise<void>,\n): DeviceAction<ActionInputParam> => {\n return defineAction<typeof actionInputParamSchema, ActionInputParam>({\n name: 'Input',\n description: 'Input the value into the element',\n interfaceAlias: 'aiInput',\n paramSchema: actionInputParamSchema,\n call,\n });\n};\n\n// KeyboardPress\nexport const actionKeyboardPressParamSchema = z.object({\n locate: getMidsceneLocationSchema()\n .describe('The element to be clicked before pressing the key')\n .optional(),\n keyName: z\n .string()\n .describe(\n \"The key to be pressed. Use '+' for key combinations, e.g., 'Control+A', 'Shift+Enter'\",\n ),\n});\nexport type ActionKeyboardPressParam = {\n locate?: LocateResultElement;\n keyName: string;\n};\n\nexport const defineActionKeyboardPress = (\n call: (param: ActionKeyboardPressParam) => Promise<void>,\n): DeviceAction<ActionKeyboardPressParam> => {\n return defineAction<\n typeof actionKeyboardPressParamSchema,\n ActionKeyboardPressParam\n >({\n name: 'KeyboardPress',\n description:\n 'Press a key or key combination, like \"Enter\", \"Tab\", \"Escape\", or \"Control+A\", \"Shift+Enter\". Do not use this to type text.',\n interfaceAlias: 'aiKeyboardPress',\n paramSchema: actionKeyboardPressParamSchema,\n call,\n });\n};\n\n// Scroll\nexport const actionScrollParamSchema = z.object({\n direction: z\n .enum(['down', 'up', 'right', 'left'])\n .default('down')\n .describe('The direction to scroll'),\n scrollType: z\n .enum([\n 'singleAction',\n 'scrollToBottom',\n 'scrollToTop',\n 'scrollToRight',\n 'scrollToLeft',\n ])\n .default('singleAction')\n .describe(\n 'The scroll behavior: \"singleAction\" for a single scroll action, \"scrollToBottom\" for scrolling to the bottom, \"scrollToTop\" for scrolling to the top, \"scrollToRight\" for scrolling to the right, \"scrollToLeft\" for scrolling to the left',\n ),\n distance: z\n .number()\n .nullable()\n .optional()\n .describe('The distance in pixels to scroll'),\n locate: getMidsceneLocationSchema()\n .optional()\n .describe('The target element to be scrolled'),\n});\n\nexport const defineActionScroll = (\n call: (param: ActionScrollParam) => Promise<void>,\n): DeviceAction<ActionScrollParam> => {\n return defineAction<typeof actionScrollParamSchema, ActionScrollParam>({\n name: 'Scroll',\n description:\n 'Scroll the page or an element. The direction to scroll, the scroll type, and the distance to scroll. The distance is the number of pixels to scroll. If not specified, use `down` direction, `once` scroll type, and `null` distance.',\n interfaceAlias: 'aiScroll',\n paramSchema: actionScrollParamSchema,\n call,\n });\n};\n\n// DragAndDrop\nexport const actionDragAndDropParamSchema = z.object({\n from: getMidsceneLocationSchema().describe('The position to be dragged'),\n to: getMidsceneLocationSchema().describe('The position to be dropped'),\n});\nexport type ActionDragAndDropParam = {\n from: LocateResultElement;\n to: LocateResultElement;\n};\n\nexport const defineActionDragAndDrop = (\n call: (param: ActionDragAndDropParam) => Promise<void>,\n): DeviceAction<ActionDragAndDropParam> => {\n return defineAction<\n typeof actionDragAndDropParamSchema,\n ActionDragAndDropParam\n >({\n name: 'DragAndDrop',\n description: 'Drag and drop the element',\n interfaceAlias: 'aiDragAndDrop',\n paramSchema: actionDragAndDropParamSchema,\n call,\n });\n};\n\nexport const ActionLongPressParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be long pressed',\n ),\n duration: z\n .number()\n .default(500)\n .optional()\n .describe('Long press duration in milliseconds'),\n});\n\nexport type ActionLongPressParam = {\n locate: LocateResultElement;\n duration?: number;\n};\nexport const defineActionLongPress = (\n call: (param: ActionLongPressParam) => Promise<void>,\n): DeviceAction<ActionLongPressParam> => {\n return defineAction<typeof ActionLongPressParamSchema, ActionLongPressParam>({\n name: 'LongPress',\n description: 'Long press the element',\n paramSchema: ActionLongPressParamSchema,\n call,\n });\n};\n\nexport const ActionSwipeParamSchema = z.object({\n start: getMidsceneLocationSchema()\n .optional()\n .describe(\n 'Starting point of the swipe gesture, if not specified, the center of the page will be used',\n ),\n direction: z\n .enum(['up', 'down', 'left', 'right'])\n .optional()\n .describe(\n 'The direction to swipe (required when using distance). The direction means the direction of the finger swipe.',\n ),\n distance: z\n .number()\n .optional()\n .describe('The distance in pixels to swipe (mutually exclusive with end)'),\n end: getMidsceneLocationSchema()\n .optional()\n .describe(\n 'Ending point of the swipe gesture (mutually exclusive with distance)',\n ),\n duration: z\n .number()\n .default(300)\n .describe('Duration of the swipe gesture in milliseconds'),\n repeat: z\n .number()\n .optional()\n .describe(\n 'The number of times to repeat the swipe gesture. 1 for default, 0 for infinite (e.g. endless swipe until the end of the page)',\n ),\n});\n\nexport type ActionSwipeParam = {\n start?: LocateResultElement;\n direction?: 'up' | 'down' | 'left' | 'right';\n distance?: number;\n end?: LocateResultElement;\n duration?: number;\n repeat?: number;\n};\n\nexport const defineActionSwipe = (\n call: (param: ActionSwipeParam) => Promise<void>,\n): DeviceAction<ActionSwipeParam> => {\n return defineAction<typeof ActionSwipeParamSchema, ActionSwipeParam>({\n name: 'Swipe',\n description:\n 'Perform a swipe gesture. You must specify either \"end\" (target location) or \"distance\" + \"direction\" - they are mutually exclusive. Use \"end\" for precise location-based swipes, or \"distance\" + \"direction\" for relative movement.',\n paramSchema: ActionSwipeParamSchema,\n call,\n });\n};\n\n// ClearInput\nexport const actionClearInputParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The input field to be cleared'),\n});\nexport type ActionClearInputParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionClearInput = (\n call: (param: ActionClearInputParam) => Promise<void>,\n): DeviceAction<ActionClearInputParam> => {\n return defineAction<\n typeof actionClearInputParamSchema,\n ActionClearInputParam\n >({\n name: 'ClearInput',\n description: inputLocateDescription,\n interfaceAlias: 'aiClearInput',\n paramSchema: actionClearInputParamSchema,\n call,\n });\n};\n\n// Assert\nexport const actionAssertParamSchema = z.object({\n thought: z\n .string()\n .describe('The thinking process of analyzing the assertion'),\n result: z\n .boolean()\n .describe('Whether the assertion is truthy, return true or false'),\n});\nexport type ActionAssertParam = {\n thought: string;\n result: boolean;\n};\n\nexport const defineActionAssert = (\n call: (param: ActionAssertParam) => Promise<void>,\n): DeviceAction<ActionAssertParam> => {\n return defineAction<typeof actionAssertParamSchema, ActionAssertParam>({\n name: 'Assert',\n description:\n 'If the user explicitly requires making an assertion (like \"there should be a button with text \"Yes\" in the popup\"), think about it, provide the reasoning and result here.',\n interfaceAlias: 'aiAssert',\n paramSchema: actionAssertParamSchema,\n call,\n });\n};\n\n/**\n * Creates an Assert action with default implementation.\n * This action can be used across all interfaces without modification.\n * If pass=true, the assertion succeeds silently.\n * If pass=false, the assertion fails and throws an error with the thought as the error message.\n */\nexport const createAssertAction = (): DeviceAction<ActionAssertParam> => {\n const debug = getDebug('core:planning-assert');\n return defineActionAssert(async (param: ActionAssertParam) => {\n debug('assertion', param.thought, param.result);\n if (!param.result) {\n throw new Error(`Assertion failed: ${param.thought}`);\n }\n // If pass is true, do nothing (assertion succeeds)\n });\n};\n\nexport type { DeviceAction } from '../types';\nexport type {\n AndroidDeviceOpt,\n AndroidDeviceInputOpt,\n IOSDeviceOpt,\n IOSDeviceInputOpt,\n} from './device-options';\n"],"names":["AbstractInterface","defineAction","config","actionTapParamSchema","z","getMidsceneLocationSchema","defineActionTap","call","actionRightClickParamSchema","defineActionRightClick","actionDoubleClickParamSchema","defineActionDoubleClick","actionHoverParamSchema","defineActionHover","inputLocateDescription","actionInputParamSchema","val","String","defineActionInput","actionKeyboardPressParamSchema","defineActionKeyboardPress","actionScrollParamSchema","defineActionScroll","actionDragAndDropParamSchema","defineActionDragAndDrop","ActionLongPressParamSchema","defineActionLongPress","ActionSwipeParamSchema","defineActionSwipe","actionClearInputParamSchema","defineActionClearInput","actionAssertParamSchema","defineActionAssert","createAssertAction","debug","getDebug","param","Error"],"mappings":";;;AAaO,MAAeA;AAmCtB;AAKO,MAAMC,eAAe,CAK1BC,SAaOA;AAIF,MAAMC,uBAAuBC,EAAE,MAAM,CAAC;IAC3C,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAMO,MAAMC,kBAAkB,CAC7BC,OAEON,aAA0D;QAC/D,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaE;QACbI;IACF;AAIK,MAAMC,8BAA8BJ,EAAE,MAAM,CAAC;IAClD,QAAQC,4BAA4B,QAAQ,CAC1C;AAEJ;AAKO,MAAMI,yBAAyB,CACpCF,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaO;QACbD;IACF;AAIK,MAAMG,+BAA+BN,EAAE,MAAM,CAAC;IACnD,QAAQC,4BAA4B,QAAQ,CAC1C;AAEJ;AAKO,MAAMM,0BAA0B,CACrCJ,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaS;QACbH;IACF;AAIK,MAAMK,yBAAyBR,EAAE,MAAM,CAAC;IAC7C,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAKO,MAAMQ,oBAAoB,CAC/BN,OAEON,aAA8D;QACnE,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaW;QACbL;IACF;AAIF,MAAMO,yBACJ;AACK,MAAMC,yBAAyBX,EAAE,MAAM,CAAC;IAC7C,OAAOA,EAAAA,KACC,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG,EAC9B,SAAS,CAAC,CAACY,MAAQC,OAAOD,MAC1B,QAAQ,CACP;IAEJ,QAAQX,4BACL,QAAQ,CAACS,wBACT,QAAQ;IACX,MAAMV,CAAC,CAADA,OACC,CAAC;QAAC;QAAW;QAAS;KAAS,EACnC,OAAO,CAAC,WACR,QAAQ,GACR,QAAQ,CACP;AAEN;AAOO,MAAMc,oBAAoB,CAC/BX,OAEON,aAA8D;QACnE,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAac;QACbR;IACF;AAIK,MAAMY,iCAAiCf,EAAE,MAAM,CAAC;IACrD,QAAQC,4BACL,QAAQ,CAAC,qDACT,QAAQ;IACX,SAASD,EAAAA,MACA,GACN,QAAQ,CACP;AAEN;AAMO,MAAMgB,4BAA4B,CACvCb,OAEON,aAGL;QACA,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAakB;QACbZ;IACF;AAIK,MAAMc,0BAA0BjB,EAAE,MAAM,CAAC;IAC9C,WAAWA,CAAC,CAADA,OACJ,CAAC;QAAC;QAAQ;QAAM;QAAS;KAAO,EACpC,OAAO,CAAC,QACR,QAAQ,CAAC;IACZ,YAAYA,CAAC,CAADA,OACL,CAAC;QACJ;QACA;QACA;QACA;QACA;KACD,EACA,OAAO,CAAC,gBACR,QAAQ,CACP;IAEJ,UAAUA,EAAAA,MACD,GACN,QAAQ,GACR,QAAQ,GACR,QAAQ,CAAC;IACZ,QAAQC,4BACL,QAAQ,GACR,QAAQ,CAAC;AACd;AAEO,MAAMiB,qBAAqB,CAChCf,OAEON,aAAgE;QACrE,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAaoB;QACbd;IACF;AAIK,MAAMgB,+BAA+BnB,EAAE,MAAM,CAAC;IACnD,MAAMC,4BAA4B,QAAQ,CAAC;IAC3C,IAAIA,4BAA4B,QAAQ,CAAC;AAC3C;AAMO,MAAMmB,0BAA0B,CACrCjB,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAasB;QACbhB;IACF;AAGK,MAAMkB,6BAA6BrB,EAAE,MAAM,CAAC;IACjD,QAAQC,4BAA4B,QAAQ,CAC1C;IAEF,UAAUD,EAAAA,MACD,GACN,OAAO,CAAC,KACR,QAAQ,GACR,QAAQ,CAAC;AACd;AAMO,MAAMsB,wBAAwB,CACnCnB,OAEON,aAAsE;QAC3E,MAAM;QACN,aAAa;QACb,aAAawB;QACblB;IACF;AAGK,MAAMoB,yBAAyBvB,EAAE,MAAM,CAAC;IAC7C,OAAOC,4BACJ,QAAQ,GACR,QAAQ,CACP;IAEJ,WAAWD,CAAC,CAADA,OACJ,CAAC;QAAC;QAAM;QAAQ;QAAQ;KAAQ,EACpC,QAAQ,GACR,QAAQ,CACP;IAEJ,UAAUA,EAAAA,MACD,GACN,QAAQ,GACR,QAAQ,CAAC;IACZ,KAAKC,4BACF,QAAQ,GACR,QAAQ,CACP;IAEJ,UAAUD,EAAAA,MACD,GACN,OAAO,CAAC,KACR,QAAQ,CAAC;IACZ,QAAQA,EAAAA,MACC,GACN,QAAQ,GACR,QAAQ,CACP;AAEN;AAWO,MAAMwB,oBAAoB,CAC/BrB,OAEON,aAA8D;QACnE,MAAM;QACN,aACE;QACF,aAAa0B;QACbpB;IACF;AAIK,MAAMsB,8BAA8BzB,EAAE,MAAM,CAAC;IAClD,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAKO,MAAMyB,yBAAyB,CACpCvB,OAEON,aAGL;QACA,MAAM;QACN,aAAaa;QACb,gBAAgB;QAChB,aAAae;QACbtB;IACF;AAIK,MAAMwB,0BAA0B3B,EAAE,MAAM,CAAC;IAC9C,SAASA,EAAAA,MACA,GACN,QAAQ,CAAC;IACZ,QAAQA,EAAAA,OACE,GACP,QAAQ,CAAC;AACd;AAMO,MAAM4B,qBAAqB,CAChCzB,OAEON,aAAgE;QACrE,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAa8B;QACbxB;IACF;AASK,MAAM0B,qBAAqB;IAChC,MAAMC,QAAQC,SAAS;IACvB,OAAOH,mBAAmB,OAAOI;QAC/BF,MAAM,aAAaE,MAAM,OAAO,EAAEA,MAAM,MAAM;QAC9C,IAAI,CAACA,MAAM,MAAM,EACf,MAAM,IAAIC,MAAM,CAAC,kBAAkB,EAAED,MAAM,OAAO,EAAE;IAGxD;AACF"}
|
|
1
|
+
{"version":3,"file":"device/index.mjs","sources":["../../../src/device/index.ts"],"sourcesContent":["import { getMidsceneLocationSchema } from '@/common';\nimport type {\n ActionScrollParam,\n DeviceAction,\n LocateResultElement,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport type { ElementNode } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { z } from 'zod';\nimport type { ElementCacheFeature, Rect, Size, UIContext } from '../types';\n\nexport abstract class AbstractInterface {\n abstract interfaceType: string;\n\n abstract screenshotBase64(): Promise<string>;\n abstract size(): Promise<Size>;\n abstract actionSpace(): DeviceAction[] | Promise<DeviceAction[]>;\n\n abstract cacheFeatureForRect?(\n rect: Rect,\n options?: {\n targetDescription?: string;\n modelConfig?: IModelConfig;\n },\n ): Promise<ElementCacheFeature>;\n abstract rectMatchesCacheFeature?(\n feature: ElementCacheFeature,\n ): Promise<Rect>;\n\n abstract destroy?(): Promise<void>;\n\n abstract describe?(): string;\n abstract beforeInvokeAction?(actionName: string, param: any): Promise<void>;\n abstract afterInvokeAction?(actionName: string, param: any): Promise<void>;\n\n // @deprecated do NOT extend this method\n abstract getElementsNodeTree?: () => Promise<ElementNode>;\n\n // @deprecated do NOT extend this method\n abstract url?: () => string | Promise<string>;\n\n // @deprecated do NOT extend this method\n abstract evaluateJavaScript?<T = any>(script: string): Promise<T>;\n\n // @deprecated do NOT extend this method\n abstract getContext?(): Promise<UIContext>;\n}\n\n// Generic function to define actions with proper type inference\n// TRuntime allows specifying a different type for the runtime parameter (after location resolution)\n// TReturn allows specifying the return type of the action\nexport const defineAction = <\n TSchema extends z.ZodType | undefined = undefined,\n TRuntime = TSchema extends z.ZodType ? z.infer<TSchema> : undefined,\n TReturn = any,\n>(\n config: {\n name: string;\n description: string;\n interfaceAlias?: string;\n paramSchema?: TSchema;\n call: (param: TRuntime) => Promise<TReturn> | TReturn;\n } & Partial<\n Omit<\n DeviceAction<TRuntime, TReturn>,\n 'name' | 'description' | 'interfaceAlias' | 'paramSchema' | 'call'\n >\n >,\n): DeviceAction<TRuntime, TReturn> => {\n return config as any; // Type assertion needed because schema validation type differs from runtime type\n};\n\n// Tap\nexport const actionTapParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The element to be tapped'),\n});\n// Override the inferred type to use LocateResultElement for the runtime locate field\nexport type ActionTapParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionTap = (\n call: (param: ActionTapParam) => Promise<void>,\n): DeviceAction<ActionTapParam> => {\n return defineAction<typeof actionTapParamSchema, ActionTapParam>({\n name: 'Tap',\n description: 'Tap the element',\n interfaceAlias: 'aiTap',\n paramSchema: actionTapParamSchema,\n call,\n });\n};\n\n// RightClick\nexport const actionRightClickParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be right clicked',\n ),\n});\nexport type ActionRightClickParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionRightClick = (\n call: (param: ActionRightClickParam) => Promise<void>,\n): DeviceAction<ActionRightClickParam> => {\n return defineAction<\n typeof actionRightClickParamSchema,\n ActionRightClickParam\n >({\n name: 'RightClick',\n description: 'Right click the element',\n interfaceAlias: 'aiRightClick',\n paramSchema: actionRightClickParamSchema,\n call,\n });\n};\n\n// DoubleClick\nexport const actionDoubleClickParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be double clicked',\n ),\n});\nexport type ActionDoubleClickParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionDoubleClick = (\n call: (param: ActionDoubleClickParam) => Promise<void>,\n): DeviceAction<ActionDoubleClickParam> => {\n return defineAction<\n typeof actionDoubleClickParamSchema,\n ActionDoubleClickParam\n >({\n name: 'DoubleClick',\n description: 'Double click the element',\n interfaceAlias: 'aiDoubleClick',\n paramSchema: actionDoubleClickParamSchema,\n call,\n });\n};\n\n// Hover\nexport const actionHoverParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The element to be hovered'),\n});\nexport type ActionHoverParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionHover = (\n call: (param: ActionHoverParam) => Promise<void>,\n): DeviceAction<ActionHoverParam> => {\n return defineAction<typeof actionHoverParamSchema, ActionHoverParam>({\n name: 'Hover',\n description: 'Move the mouse to the element',\n interfaceAlias: 'aiHover',\n paramSchema: actionHoverParamSchema,\n call,\n });\n};\n\n// Input\nconst inputLocateDescription =\n 'the position of the placeholder or text content in the target input field. If there is no content, locate the center of the input field.';\nexport const actionInputParamSchema = z.object({\n value: z\n .union([z.string(), z.number()])\n .transform((val) => String(val))\n .describe(\n 'The text to input. Provide the final content for replace/append modes, or an empty string when using clear mode to remove existing text.',\n ),\n locate: getMidsceneLocationSchema()\n .describe(inputLocateDescription)\n .optional(),\n mode: z\n .enum(['replace', 'clear', 'append'])\n .default('replace')\n .optional()\n .describe(\n 'Input mode: \"replace\" (default) - clear the field and input the value; \"append\" - append the value to existing content; \"clear\" - clear the field without inputting new text.',\n ),\n});\nexport type ActionInputParam = {\n value: string;\n locate?: LocateResultElement;\n mode?: 'replace' | 'clear' | 'append';\n};\n\nexport const defineActionInput = (\n call: (param: ActionInputParam) => Promise<void>,\n): DeviceAction<ActionInputParam> => {\n return defineAction<typeof actionInputParamSchema, ActionInputParam>({\n name: 'Input',\n description: 'Input the value into the element',\n interfaceAlias: 'aiInput',\n paramSchema: actionInputParamSchema,\n call,\n });\n};\n\n// KeyboardPress\nexport const actionKeyboardPressParamSchema = z.object({\n locate: getMidsceneLocationSchema()\n .describe('The element to be clicked before pressing the key')\n .optional(),\n keyName: z\n .string()\n .describe(\n \"The key to be pressed. Use '+' for key combinations, e.g., 'Control+A', 'Shift+Enter'\",\n ),\n});\nexport type ActionKeyboardPressParam = {\n locate?: LocateResultElement;\n keyName: string;\n};\n\nexport const defineActionKeyboardPress = (\n call: (param: ActionKeyboardPressParam) => Promise<void>,\n): DeviceAction<ActionKeyboardPressParam> => {\n return defineAction<\n typeof actionKeyboardPressParamSchema,\n ActionKeyboardPressParam\n >({\n name: 'KeyboardPress',\n description:\n 'Press a key or key combination, like \"Enter\", \"Tab\", \"Escape\", or \"Control+A\", \"Shift+Enter\". Do not use this to type text.',\n interfaceAlias: 'aiKeyboardPress',\n paramSchema: actionKeyboardPressParamSchema,\n call,\n });\n};\n\n// Scroll\nexport const actionScrollParamSchema = z.object({\n direction: z\n .enum(['down', 'up', 'right', 'left'])\n .default('down')\n .describe('The direction to scroll'),\n scrollType: z\n .enum([\n 'singleAction',\n 'scrollToBottom',\n 'scrollToTop',\n 'scrollToRight',\n 'scrollToLeft',\n ])\n .default('singleAction')\n .describe(\n 'The scroll behavior: \"singleAction\" for a single scroll action, \"scrollToBottom\" for scrolling to the bottom, \"scrollToTop\" for scrolling to the top, \"scrollToRight\" for scrolling to the right, \"scrollToLeft\" for scrolling to the left',\n ),\n distance: z\n .number()\n .nullable()\n .optional()\n .describe('The distance in pixels to scroll'),\n locate: getMidsceneLocationSchema()\n .optional()\n .describe('The target element to be scrolled'),\n});\n\nexport const defineActionScroll = (\n call: (param: ActionScrollParam) => Promise<void>,\n): DeviceAction<ActionScrollParam> => {\n return defineAction<typeof actionScrollParamSchema, ActionScrollParam>({\n name: 'Scroll',\n description:\n 'Scroll the page or an element. The direction to scroll, the scroll type, and the distance to scroll. The distance is the number of pixels to scroll. If not specified, use `down` direction, `once` scroll type, and `null` distance.',\n interfaceAlias: 'aiScroll',\n paramSchema: actionScrollParamSchema,\n call,\n });\n};\n\n// DragAndDrop\nexport const actionDragAndDropParamSchema = z.object({\n from: getMidsceneLocationSchema().describe('The position to be dragged'),\n to: getMidsceneLocationSchema().describe('The position to be dropped'),\n});\nexport type ActionDragAndDropParam = {\n from: LocateResultElement;\n to: LocateResultElement;\n};\n\nexport const defineActionDragAndDrop = (\n call: (param: ActionDragAndDropParam) => Promise<void>,\n): DeviceAction<ActionDragAndDropParam> => {\n return defineAction<\n typeof actionDragAndDropParamSchema,\n ActionDragAndDropParam\n >({\n name: 'DragAndDrop',\n description: 'Drag and drop the element',\n interfaceAlias: 'aiDragAndDrop',\n paramSchema: actionDragAndDropParamSchema,\n call,\n });\n};\n\nexport const ActionLongPressParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe(\n 'The element to be long pressed',\n ),\n duration: z\n .number()\n .default(500)\n .optional()\n .describe('Long press duration in milliseconds'),\n});\n\nexport type ActionLongPressParam = {\n locate: LocateResultElement;\n duration?: number;\n};\nexport const defineActionLongPress = (\n call: (param: ActionLongPressParam) => Promise<void>,\n): DeviceAction<ActionLongPressParam> => {\n return defineAction<typeof ActionLongPressParamSchema, ActionLongPressParam>({\n name: 'LongPress',\n description: 'Long press the element',\n paramSchema: ActionLongPressParamSchema,\n call,\n });\n};\n\nexport const ActionSwipeParamSchema = z.object({\n start: getMidsceneLocationSchema()\n .optional()\n .describe(\n 'Starting point of the swipe gesture, if not specified, the center of the page will be used',\n ),\n direction: z\n .enum(['up', 'down', 'left', 'right'])\n .optional()\n .describe(\n 'The direction to swipe (required when using distance). The direction means the direction of the finger swipe.',\n ),\n distance: z\n .number()\n .optional()\n .describe('The distance in pixels to swipe (mutually exclusive with end)'),\n end: getMidsceneLocationSchema()\n .optional()\n .describe(\n 'Ending point of the swipe gesture (mutually exclusive with distance)',\n ),\n duration: z\n .number()\n .default(300)\n .describe('Duration of the swipe gesture in milliseconds'),\n repeat: z\n .number()\n .optional()\n .describe(\n 'The number of times to repeat the swipe gesture. 1 for default, 0 for infinite (e.g. endless swipe until the end of the page)',\n ),\n});\n\nexport type ActionSwipeParam = {\n start?: LocateResultElement;\n direction?: 'up' | 'down' | 'left' | 'right';\n distance?: number;\n end?: LocateResultElement;\n duration?: number;\n repeat?: number;\n};\n\nexport const defineActionSwipe = (\n call: (param: ActionSwipeParam) => Promise<void>,\n): DeviceAction<ActionSwipeParam> => {\n return defineAction<typeof ActionSwipeParamSchema, ActionSwipeParam>({\n name: 'Swipe',\n description:\n 'Perform a swipe gesture. You must specify either \"end\" (target location) or \"distance\" + \"direction\" - they are mutually exclusive. Use \"end\" for precise location-based swipes, or \"distance\" + \"direction\" for relative movement.',\n paramSchema: ActionSwipeParamSchema,\n call,\n });\n};\n\n// ClearInput\nexport const actionClearInputParamSchema = z.object({\n locate: getMidsceneLocationSchema().describe('The input field to be cleared'),\n});\nexport type ActionClearInputParam = {\n locate: LocateResultElement;\n};\n\nexport const defineActionClearInput = (\n call: (param: ActionClearInputParam) => Promise<void>,\n): DeviceAction<ActionClearInputParam> => {\n return defineAction<\n typeof actionClearInputParamSchema,\n ActionClearInputParam\n >({\n name: 'ClearInput',\n description: inputLocateDescription,\n interfaceAlias: 'aiClearInput',\n paramSchema: actionClearInputParamSchema,\n call,\n });\n};\n\n// Assert\nexport const actionAssertParamSchema = z.object({\n condition: z.string().describe('The condition of the assertion'),\n});\nexport type ActionAssertParam = {\n condition: string;\n};\n\nexport const defineActionAssert = (\n call: (param: ActionAssertParam) => Promise<void>,\n): DeviceAction<ActionAssertParam> => {\n return defineAction<typeof actionAssertParamSchema, ActionAssertParam>({\n name: 'Assert',\n description:\n 'If the user explicitly requires making an assertion (like \"there should be a button with text \"Yes\" in the popup\"), this tool will think about it, and then provide a solid result',\n interfaceAlias: 'aiAssert',\n paramSchema: actionAssertParamSchema,\n call,\n });\n};\n\nexport type { DeviceAction } from '../types';\nexport type {\n AndroidDeviceOpt,\n AndroidDeviceInputOpt,\n IOSDeviceOpt,\n IOSDeviceInputOpt,\n} from './device-options';\n"],"names":["AbstractInterface","defineAction","config","actionTapParamSchema","z","getMidsceneLocationSchema","defineActionTap","call","actionRightClickParamSchema","defineActionRightClick","actionDoubleClickParamSchema","defineActionDoubleClick","actionHoverParamSchema","defineActionHover","inputLocateDescription","actionInputParamSchema","val","String","defineActionInput","actionKeyboardPressParamSchema","defineActionKeyboardPress","actionScrollParamSchema","defineActionScroll","actionDragAndDropParamSchema","defineActionDragAndDrop","ActionLongPressParamSchema","defineActionLongPress","ActionSwipeParamSchema","defineActionSwipe","actionClearInputParamSchema","defineActionClearInput","actionAssertParamSchema","defineActionAssert"],"mappings":";;AAaO,MAAeA;AAmCtB;AAKO,MAAMC,eAAe,CAK1BC,SAaOA;AAIF,MAAMC,uBAAuBC,EAAE,MAAM,CAAC;IAC3C,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAMO,MAAMC,kBAAkB,CAC7BC,OAEON,aAA0D;QAC/D,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaE;QACbI;IACF;AAIK,MAAMC,8BAA8BJ,EAAE,MAAM,CAAC;IAClD,QAAQC,4BAA4B,QAAQ,CAC1C;AAEJ;AAKO,MAAMI,yBAAyB,CACpCF,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaO;QACbD;IACF;AAIK,MAAMG,+BAA+BN,EAAE,MAAM,CAAC;IACnD,QAAQC,4BAA4B,QAAQ,CAC1C;AAEJ;AAKO,MAAMM,0BAA0B,CACrCJ,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaS;QACbH;IACF;AAIK,MAAMK,yBAAyBR,EAAE,MAAM,CAAC;IAC7C,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAKO,MAAMQ,oBAAoB,CAC/BN,OAEON,aAA8D;QACnE,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAaW;QACbL;IACF;AAIF,MAAMO,yBACJ;AACK,MAAMC,yBAAyBX,EAAE,MAAM,CAAC;IAC7C,OAAOA,EAAAA,KACC,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG,EAC9B,SAAS,CAAC,CAACY,MAAQC,OAAOD,MAC1B,QAAQ,CACP;IAEJ,QAAQX,4BACL,QAAQ,CAACS,wBACT,QAAQ;IACX,MAAMV,CAAC,CAADA,OACC,CAAC;QAAC;QAAW;QAAS;KAAS,EACnC,OAAO,CAAC,WACR,QAAQ,GACR,QAAQ,CACP;AAEN;AAOO,MAAMc,oBAAoB,CAC/BX,OAEON,aAA8D;QACnE,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAac;QACbR;IACF;AAIK,MAAMY,iCAAiCf,EAAE,MAAM,CAAC;IACrD,QAAQC,4BACL,QAAQ,CAAC,qDACT,QAAQ;IACX,SAASD,EAAAA,MACA,GACN,QAAQ,CACP;AAEN;AAMO,MAAMgB,4BAA4B,CACvCb,OAEON,aAGL;QACA,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAakB;QACbZ;IACF;AAIK,MAAMc,0BAA0BjB,EAAE,MAAM,CAAC;IAC9C,WAAWA,CAAC,CAADA,OACJ,CAAC;QAAC;QAAQ;QAAM;QAAS;KAAO,EACpC,OAAO,CAAC,QACR,QAAQ,CAAC;IACZ,YAAYA,CAAC,CAADA,OACL,CAAC;QACJ;QACA;QACA;QACA;QACA;KACD,EACA,OAAO,CAAC,gBACR,QAAQ,CACP;IAEJ,UAAUA,EAAAA,MACD,GACN,QAAQ,GACR,QAAQ,GACR,QAAQ,CAAC;IACZ,QAAQC,4BACL,QAAQ,GACR,QAAQ,CAAC;AACd;AAEO,MAAMiB,qBAAqB,CAChCf,OAEON,aAAgE;QACrE,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAaoB;QACbd;IACF;AAIK,MAAMgB,+BAA+BnB,EAAE,MAAM,CAAC;IACnD,MAAMC,4BAA4B,QAAQ,CAAC;IAC3C,IAAIA,4BAA4B,QAAQ,CAAC;AAC3C;AAMO,MAAMmB,0BAA0B,CACrCjB,OAEON,aAGL;QACA,MAAM;QACN,aAAa;QACb,gBAAgB;QAChB,aAAasB;QACbhB;IACF;AAGK,MAAMkB,6BAA6BrB,EAAE,MAAM,CAAC;IACjD,QAAQC,4BAA4B,QAAQ,CAC1C;IAEF,UAAUD,EAAAA,MACD,GACN,OAAO,CAAC,KACR,QAAQ,GACR,QAAQ,CAAC;AACd;AAMO,MAAMsB,wBAAwB,CACnCnB,OAEON,aAAsE;QAC3E,MAAM;QACN,aAAa;QACb,aAAawB;QACblB;IACF;AAGK,MAAMoB,yBAAyBvB,EAAE,MAAM,CAAC;IAC7C,OAAOC,4BACJ,QAAQ,GACR,QAAQ,CACP;IAEJ,WAAWD,CAAC,CAADA,OACJ,CAAC;QAAC;QAAM;QAAQ;QAAQ;KAAQ,EACpC,QAAQ,GACR,QAAQ,CACP;IAEJ,UAAUA,EAAAA,MACD,GACN,QAAQ,GACR,QAAQ,CAAC;IACZ,KAAKC,4BACF,QAAQ,GACR,QAAQ,CACP;IAEJ,UAAUD,EAAAA,MACD,GACN,OAAO,CAAC,KACR,QAAQ,CAAC;IACZ,QAAQA,EAAAA,MACC,GACN,QAAQ,GACR,QAAQ,CACP;AAEN;AAWO,MAAMwB,oBAAoB,CAC/BrB,OAEON,aAA8D;QACnE,MAAM;QACN,aACE;QACF,aAAa0B;QACbpB;IACF;AAIK,MAAMsB,8BAA8BzB,EAAE,MAAM,CAAC;IAClD,QAAQC,4BAA4B,QAAQ,CAAC;AAC/C;AAKO,MAAMyB,yBAAyB,CACpCvB,OAEON,aAGL;QACA,MAAM;QACN,aAAaa;QACb,gBAAgB;QAChB,aAAae;QACbtB;IACF;AAIK,MAAMwB,0BAA0B3B,EAAE,MAAM,CAAC;IAC9C,WAAWA,EAAE,MAAM,GAAG,QAAQ,CAAC;AACjC;AAKO,MAAM4B,qBAAqB,CAChCzB,OAEON,aAAgE;QACrE,MAAM;QACN,aACE;QACF,gBAAgB;QAChB,aAAa8B;QACbxB;IACF"}
|
|
@@ -69,7 +69,7 @@ class Service {
|
|
|
69
69
|
searchAreaUsage
|
|
70
70
|
};
|
|
71
71
|
let errorLog;
|
|
72
|
-
if (parseResult.errors?.length) errorLog = `
|
|
72
|
+
if (parseResult.errors?.length) errorLog = `failed to locate element: \n${parseResult.errors.join('\n')}`;
|
|
73
73
|
const dumpData = {
|
|
74
74
|
type: 'locate',
|
|
75
75
|
userQuery: {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"service/index.mjs","sources":["../../../src/service/index.ts"],"sourcesContent":["import {\n AiExtractElementInfo,\n AiLocateElement,\n callAIWithObjectResponse,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport { AIActionType, type AIArgs, expandSearchArea } from '@/common';\nimport type {\n AIDescribeElementResponse,\n AIUsageInfo,\n DetailedLocateParam,\n LocateResultWithDump,\n PartialServiceDumpFromSDK,\n Rect,\n ServiceExtractOption,\n ServiceExtractParam,\n ServiceExtractResult,\n ServiceTaskInfo,\n UIContext,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport {\n type IModelConfig,\n MIDSCENE_FORCE_DEEP_THINK,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../common';\nimport { createServiceDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\ninterface ServiceOptions {\n taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;\n aiVendorFn?: typeof callAIWithObjectResponse;\n}\n\nconst debug = getDebug('ai:service');\nexport default class Service {\n contextRetrieverFn: () => Promise<UIContext> | UIContext;\n\n aiVendorFn: Exclude<ServiceOptions['aiVendorFn'], undefined> =\n callAIWithObjectResponse;\n\n taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;\n\n constructor(\n context: UIContext | (() => Promise<UIContext> | UIContext),\n opt?: ServiceOptions,\n ) {\n assert(context, 'context is required for Service');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n // just for unit test, aiVendorFn is callAIWithObjectResponse by default\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt: LocateOpts,\n modelConfig: IModelConfig,\n ): Promise<LocateResultWithDump> {\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = globalConfigManager.getEnvConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n\n const { vlMode } = modelConfig;\n\n if (searchAreaPrompt && !vlMode) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn());\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n modelConfig,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const { parseResult, rect, rawResponse, usage } = await AiLocateElement({\n callAIFn: this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n modelConfig,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: ServiceTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialServiceDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements = parseResult.elements || [];\n\n const dump = createServiceDump({\n ...dumpData,\n matchedElement: elements,\n });\n\n if (errorLog) {\n throw new ServiceError(errorLog, dump);\n }\n\n if (elements.length > 1) {\n throw new ServiceError(\n `locate: multiple elements found, length = ${elements.length}`,\n dump,\n );\n }\n\n if (elements.length === 1) {\n return {\n element: {\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n description: elements[0]!.description,\n },\n rect,\n dump,\n };\n }\n\n return {\n element: null,\n rect,\n dump,\n };\n }\n\n async extract<T>(\n dataDemand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n pageDescription?: string,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ServiceExtractResult<T>> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const context = await this.contextRetrieverFn();\n\n const startTime = Date.now();\n\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelConfig,\n pageDescription,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: ServiceTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialServiceDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n const dump = createServiceDump({\n ...dumpData,\n data,\n });\n\n if (errorLog && !data) {\n throw new ServiceError(errorLog, dump);\n }\n\n return {\n data,\n thought,\n usage,\n dump,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n modelConfig: IModelConfig,\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for service.describe');\n const context = await this.contextRetrieverFn();\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for service.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const { vlMode } = modelConfig;\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(targetRect, context.size, vlMode);\n debug('describe: set searchArea', searchArea);\n const croppedResult = await cropByRect(\n imagePayload,\n searchArea,\n vlMode === 'qwen2.5-vl',\n );\n imagePayload = croppedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn = this\n .aiVendorFn as typeof callAIWithObjectResponse<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelConfig,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Service","query","opt","modelConfig","queryPrompt","assert","globalDeepThinkSwitch","globalConfigManager","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","vlMode","console","undefined","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","rawResponse","usage","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","dump","createServiceDump","ServiceError","dataDemand","pageDescription","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","croppedResult","cropByRect","msgs","callAIFn","res","AIActionType","content","callAIWithObjectResponse","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;;AA8CA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IA4BnB,MAAM,OACJC,KAA0B,EAC1BC,GAAe,EACfC,WAAyB,EACM;QAC/B,MAAMC,cAAc,AAAiB,YAAjB,OAAOH,QAAqBA,QAAQA,MAAM,MAAM;QACpEI,OAAOD,aAAa;QAEpBC,OAAO,AAAiB,YAAjB,OAAOJ,OAAoB;QAElC,MAAMK,wBAAwBC,oBAAoB,qBAAqB,CACrEC;QAEF,IAAIF,uBACFR,MAAM,yBAAyBQ;QAEjC,IAAIG;QACJ,IAAIR,MAAM,SAAS,IAAIK,uBACrBG,mBAAmBR,MAAM,MAAM;QAGjC,MAAM,EAAES,MAAM,EAAE,GAAGP;QAEnB,IAAIM,oBAAoB,CAACC,QAAQ;YAC/BC,QAAQ,IAAI,CACV;YAEFF,mBAAmBG;QACrB;QAEA,MAAMC,UAAUX,KAAK,WAAY,MAAM,IAAI,CAAC,kBAAkB;QAE9D,IAAIY;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBJ;gBACpBN;YACF;YACAE,OACEY,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEC,IAAI,EAAEC,WAAW,EAAEC,KAAK,EAAE,GAAG,MAAMC,gBAAgB;YACtE,UAAU,IAAI,CAAC,UAAU;YACzBZ;YACA,0BAA0BT;YAC1B,cAAca;YACdd;QACF;QAEA,MAAMuB,WAAWN,KAAK,GAAG,KAAKD;QAC9B,MAAMQ,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACL;YAC5B,gBAAgBK,KAAK,SAAS,CAACP;YAC/BG;YACAV;YACAC;YACAC;QACF;QAEA,IAAIa;QACJ,IAAIR,YAAY,MAAM,EAAE,QACtBQ,WAAW,CAAC,6BAA6B,EAAER,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMS,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS1B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAakB;YACb,MAAM;YACNK;YACA,WAAW,CAAC,CAACb;YACb,OAAOe;QACT;QAEA,MAAME,WAAWV,YAAY,QAAQ,IAAI,EAAE;QAE3C,MAAMW,OAAOC,kBAAkB;YAC7B,GAAGH,QAAQ;YACX,gBAAgBC;QAClB;QAEA,IAAIF,UACF,MAAM,IAAIK,aAAaL,UAAUG;QAGnC,IAAID,SAAS,MAAM,GAAG,GACpB,MAAM,IAAIG,aACR,CAAC,0CAA0C,EAAEH,SAAS,MAAM,EAAE,EAC9DC;QAIJ,IAAID,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,aAAaA,QAAQ,CAAC,EAAE,CAAE,WAAW;YACvC;YACAT;YACAU;QACF;QAGF,OAAO;YACL,SAAS;YACTV;YACAU;QACF;IACF;IAEA,MAAM,QACJG,UAA+B,EAC/BhC,WAAyB,EACzBD,GAA0B,EAC1BkC,eAAwB,EACxBC,gBAAoC,EACF;QAClChC,OACE,AAAsB,YAAtB,OAAO8B,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMtB,UAAU,MAAM,IAAI,CAAC,kBAAkB;QAE7C,MAAMM,YAAYC,KAAK,GAAG;QAE1B,MAAM,EAAEC,WAAW,EAAEG,KAAK,EAAE,GAAG,MAAMc,qBAAwB;YAC3DzB;YACA,WAAWsB;YACXE;YACA,eAAenC;YACfC;YACAiC;QACF;QAEA,MAAMV,WAAWN,KAAK,GAAG,KAAKD;QAC9B,MAAMQ,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACP;QAC9B;QAEA,IAAIQ;QACJ,IAAIR,YAAY,MAAM,EAAE,QACtBQ,WAAW,CAAC,qBAAqB,EAAER,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMS,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTK;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNR;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGnB,eAAe,CAAC;QAG1C,MAAMW,OAAOC,kBAAkB;YAC7B,GAAGH,QAAQ;YACXS;QACF;QAEA,IAAIV,YAAY,CAACU,MACf,MAAM,IAAIL,aAAaL,UAAUG;QAGnC,OAAO;YACLO;YACAC;YACAhB;YACAQ;QACF;IACF;IAEA,MAAM,SACJS,MAA+B,EAC/BtC,WAAyB,EACzBD,GAEC,EACwD;QACzDG,OAAOoC,QAAQ;QACf,MAAM5B,UAAU,MAAM,IAAI,CAAC,kBAAkB;QAC7C,MAAM,EAAE6B,gBAAgB,EAAEC,IAAI,EAAE,GAAG9B;QACnCR,OAAOqC,kBAAkB;QAEzB,MAAM,EAAEhC,MAAM,EAAE,GAAGP;QACnB,MAAMyC,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAI7C,KAAK,WAAW;YAClB,MAAMY,aAAasC,iBAAiBL,YAAYlC,QAAQ,IAAI,EAAEH;YAC9DZ,MAAM,4BAA4BgB;YAClC,MAAMuC,gBAAgB,MAAMC,WAC1BJ,cACApC,YACAJ,AAAW,iBAAXA;YAEFwC,eAAeG,cAAc,WAAW;QAC1C;QAEA,MAAME,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WAAW,IAAI,CAClB,UAAU;QAEb,MAAMC,MAAM,MAAMD,SAChBD,MACAG,aAAa,gBAAgB,EAC7BvD;QAGF,MAAM,EAAEwD,OAAO,EAAE,GAAGF;QACpBpD,OAAO,CAACsD,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1DtD,OAAOsD,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IAvSA,YACE9C,OAA2D,EAC3DX,GAAoB,CACpB;QAVF;QAEA,qCACE0D;QAEF;QAMEvD,OAAOQ,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMgD,QAAQ,OAAO,CAAChD;QAIlD,IAAI,AAA2B,WAApBX,KAAK,YACd,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,KAAK,UACd,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAsRF"}
|
|
1
|
+
{"version":3,"file":"service/index.mjs","sources":["../../../src/service/index.ts"],"sourcesContent":["import {\n AiExtractElementInfo,\n AiLocateElement,\n callAIWithObjectResponse,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport { AIActionType, type AIArgs, expandSearchArea } from '@/common';\nimport type {\n AIDescribeElementResponse,\n AIUsageInfo,\n DetailedLocateParam,\n LocateResultWithDump,\n PartialServiceDumpFromSDK,\n Rect,\n ServiceExtractOption,\n ServiceExtractParam,\n ServiceExtractResult,\n ServiceTaskInfo,\n UIContext,\n} from '@/types';\nimport { ServiceError } from '@/types';\nimport {\n type IModelConfig,\n MIDSCENE_FORCE_DEEP_THINK,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../common';\nimport { createServiceDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\ninterface ServiceOptions {\n taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;\n aiVendorFn?: typeof callAIWithObjectResponse;\n}\n\nconst debug = getDebug('ai:service');\nexport default class Service {\n contextRetrieverFn: () => Promise<UIContext> | UIContext;\n\n aiVendorFn: Exclude<ServiceOptions['aiVendorFn'], undefined> =\n callAIWithObjectResponse;\n\n taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;\n\n constructor(\n context: UIContext | (() => Promise<UIContext> | UIContext),\n opt?: ServiceOptions,\n ) {\n assert(context, 'context is required for Service');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n // just for unit test, aiVendorFn is callAIWithObjectResponse by default\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt: LocateOpts,\n modelConfig: IModelConfig,\n ): Promise<LocateResultWithDump> {\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = globalConfigManager.getEnvConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n\n const { vlMode } = modelConfig;\n\n if (searchAreaPrompt && !vlMode) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn());\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n modelConfig,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const { parseResult, rect, rawResponse, usage } = await AiLocateElement({\n callAIFn: this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n modelConfig,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: ServiceTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `failed to locate element: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialServiceDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements = parseResult.elements || [];\n\n const dump = createServiceDump({\n ...dumpData,\n matchedElement: elements,\n });\n\n if (errorLog) {\n throw new ServiceError(errorLog, dump);\n }\n\n if (elements.length > 1) {\n throw new ServiceError(\n `locate: multiple elements found, length = ${elements.length}`,\n dump,\n );\n }\n\n if (elements.length === 1) {\n return {\n element: {\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n description: elements[0]!.description,\n },\n rect,\n dump,\n };\n }\n\n return {\n element: null,\n rect,\n dump,\n };\n }\n\n async extract<T>(\n dataDemand: ServiceExtractParam,\n modelConfig: IModelConfig,\n opt?: ServiceExtractOption,\n pageDescription?: string,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<ServiceExtractResult<T>> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const context = await this.contextRetrieverFn();\n\n const startTime = Date.now();\n\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelConfig,\n pageDescription,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: ServiceTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialServiceDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n const dump = createServiceDump({\n ...dumpData,\n data,\n });\n\n if (errorLog && !data) {\n throw new ServiceError(errorLog, dump);\n }\n\n return {\n data,\n thought,\n usage,\n dump,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n modelConfig: IModelConfig,\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for service.describe');\n const context = await this.contextRetrieverFn();\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for service.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const { vlMode } = modelConfig;\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(targetRect, context.size, vlMode);\n debug('describe: set searchArea', searchArea);\n const croppedResult = await cropByRect(\n imagePayload,\n searchArea,\n vlMode === 'qwen2.5-vl',\n );\n imagePayload = croppedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn = this\n .aiVendorFn as typeof callAIWithObjectResponse<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelConfig,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Service","query","opt","modelConfig","queryPrompt","assert","globalDeepThinkSwitch","globalConfigManager","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","vlMode","console","undefined","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","rawResponse","usage","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","dump","createServiceDump","ServiceError","dataDemand","pageDescription","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","croppedResult","cropByRect","msgs","callAIFn","res","AIActionType","content","callAIWithObjectResponse","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;;AA8CA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IA4BnB,MAAM,OACJC,KAA0B,EAC1BC,GAAe,EACfC,WAAyB,EACM;QAC/B,MAAMC,cAAc,AAAiB,YAAjB,OAAOH,QAAqBA,QAAQA,MAAM,MAAM;QACpEI,OAAOD,aAAa;QAEpBC,OAAO,AAAiB,YAAjB,OAAOJ,OAAoB;QAElC,MAAMK,wBAAwBC,oBAAoB,qBAAqB,CACrEC;QAEF,IAAIF,uBACFR,MAAM,yBAAyBQ;QAEjC,IAAIG;QACJ,IAAIR,MAAM,SAAS,IAAIK,uBACrBG,mBAAmBR,MAAM,MAAM;QAGjC,MAAM,EAAES,MAAM,EAAE,GAAGP;QAEnB,IAAIM,oBAAoB,CAACC,QAAQ;YAC/BC,QAAQ,IAAI,CACV;YAEFF,mBAAmBG;QACrB;QAEA,MAAMC,UAAUX,KAAK,WAAY,MAAM,IAAI,CAAC,kBAAkB;QAE9D,IAAIY;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBJ;gBACpBN;YACF;YACAE,OACEY,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEC,IAAI,EAAEC,WAAW,EAAEC,KAAK,EAAE,GAAG,MAAMC,gBAAgB;YACtE,UAAU,IAAI,CAAC,UAAU;YACzBZ;YACA,0BAA0BT;YAC1B,cAAca;YACdd;QACF;QAEA,MAAMuB,WAAWN,KAAK,GAAG,KAAKD;QAC9B,MAAMQ,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACL;YAC5B,gBAAgBK,KAAK,SAAS,CAACP;YAC/BG;YACAV;YACAC;YACAC;QACF;QAEA,IAAIa;QACJ,IAAIR,YAAY,MAAM,EAAE,QACtBQ,WAAW,CAAC,4BAA4B,EAAER,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG3E,MAAMS,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS1B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAakB;YACb,MAAM;YACNK;YACA,WAAW,CAAC,CAACb;YACb,OAAOe;QACT;QAEA,MAAME,WAAWV,YAAY,QAAQ,IAAI,EAAE;QAE3C,MAAMW,OAAOC,kBAAkB;YAC7B,GAAGH,QAAQ;YACX,gBAAgBC;QAClB;QAEA,IAAIF,UACF,MAAM,IAAIK,aAAaL,UAAUG;QAGnC,IAAID,SAAS,MAAM,GAAG,GACpB,MAAM,IAAIG,aACR,CAAC,0CAA0C,EAAEH,SAAS,MAAM,EAAE,EAC9DC;QAIJ,IAAID,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,aAAaA,QAAQ,CAAC,EAAE,CAAE,WAAW;YACvC;YACAT;YACAU;QACF;QAGF,OAAO;YACL,SAAS;YACTV;YACAU;QACF;IACF;IAEA,MAAM,QACJG,UAA+B,EAC/BhC,WAAyB,EACzBD,GAA0B,EAC1BkC,eAAwB,EACxBC,gBAAoC,EACF;QAClChC,OACE,AAAsB,YAAtB,OAAO8B,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMtB,UAAU,MAAM,IAAI,CAAC,kBAAkB;QAE7C,MAAMM,YAAYC,KAAK,GAAG;QAE1B,MAAM,EAAEC,WAAW,EAAEG,KAAK,EAAE,GAAG,MAAMc,qBAAwB;YAC3DzB;YACA,WAAWsB;YACXE;YACA,eAAenC;YACfC;YACAiC;QACF;QAEA,MAAMV,WAAWN,KAAK,GAAG,KAAKD;QAC9B,MAAMQ,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACP;QAC9B;QAEA,IAAIQ;QACJ,IAAIR,YAAY,MAAM,EAAE,QACtBQ,WAAW,CAAC,qBAAqB,EAAER,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMS,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTK;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNR;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGnB,eAAe,CAAC;QAG1C,MAAMW,OAAOC,kBAAkB;YAC7B,GAAGH,QAAQ;YACXS;QACF;QAEA,IAAIV,YAAY,CAACU,MACf,MAAM,IAAIL,aAAaL,UAAUG;QAGnC,OAAO;YACLO;YACAC;YACAhB;YACAQ;QACF;IACF;IAEA,MAAM,SACJS,MAA+B,EAC/BtC,WAAyB,EACzBD,GAEC,EACwD;QACzDG,OAAOoC,QAAQ;QACf,MAAM5B,UAAU,MAAM,IAAI,CAAC,kBAAkB;QAC7C,MAAM,EAAE6B,gBAAgB,EAAEC,IAAI,EAAE,GAAG9B;QACnCR,OAAOqC,kBAAkB;QAEzB,MAAM,EAAEhC,MAAM,EAAE,GAAGP;QACnB,MAAMyC,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAI7C,KAAK,WAAW;YAClB,MAAMY,aAAasC,iBAAiBL,YAAYlC,QAAQ,IAAI,EAAEH;YAC9DZ,MAAM,4BAA4BgB;YAClC,MAAMuC,gBAAgB,MAAMC,WAC1BJ,cACApC,YACAJ,AAAW,iBAAXA;YAEFwC,eAAeG,cAAc,WAAW;QAC1C;QAEA,MAAME,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WAAW,IAAI,CAClB,UAAU;QAEb,MAAMC,MAAM,MAAMD,SAChBD,MACAG,aAAa,gBAAgB,EAC7BvD;QAGF,MAAM,EAAEwD,OAAO,EAAE,GAAGF;QACpBpD,OAAO,CAACsD,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1DtD,OAAOsD,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IAvSA,YACE9C,OAA2D,EAC3DX,GAAoB,CACpB;QAVF;QAEA,qCACE0D;QAEF;QAMEvD,OAAOQ,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMgD,QAAQ,OAAO,CAAChD;QAIlD,IAAI,AAA2B,WAApBX,KAAK,YACd,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,KAAK,UACd,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAsRF"}
|
package/dist/es/types.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.mjs","sources":["../../src/types.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\n\nimport type { NodeType } from '@midscene/shared/constants';\nimport type { CreateOpenAIClientFn, TModelConfig } from '@midscene/shared/env';\nimport type {\n BaseElement,\n LocateResultElement,\n Rect,\n Size,\n} from '@midscene/shared/types';\nimport type { z } from 'zod';\nimport type { TUserPrompt } from './common';\nimport type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';\n\nexport type {\n ElementTreeNode,\n BaseElement,\n Rect,\n Size,\n Point,\n} from '@midscene/shared/types';\nexport * from './yaml';\n\nexport type AIUsageInfo = Record<string, any> & {\n prompt_tokens: number | undefined;\n completion_tokens: number | undefined;\n total_tokens: number | undefined;\n cached_input: number | undefined;\n time_cost: number | undefined;\n model_name: string | undefined;\n model_description: string | undefined;\n intent: string | undefined;\n};\n\nexport type { LocateResultElement };\n\n/**\n * openai\n *\n */\nexport enum AIResponseFormat {\n JSON = 'json_object',\n TEXT = 'text',\n}\n\nexport type AISingleElementResponseById = {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n};\n\nexport type AISingleElementResponseByPosition = {\n position?: {\n x: number;\n y: number;\n };\n bbox?: [number, number, number, number];\n reason: string;\n text: string;\n};\n\nexport type AISingleElementResponse = AISingleElementResponseById;\n\nexport interface AIElementCoordinatesResponse {\n bbox: [number, number, number, number];\n errors?: string[];\n}\n\nexport type AIElementResponse = AIElementCoordinatesResponse;\n\nexport interface AIDataExtractionResponse<DataDemand> {\n data: DataDemand;\n errors?: string[];\n thought?: string;\n}\n\nexport interface AISectionLocatorResponse {\n bbox: [number, number, number, number];\n references_bbox?: [number, number, number, number][];\n error?: string;\n}\n\nexport interface AIAssertionResponse {\n pass: boolean;\n thought: string;\n}\n\nexport interface AIDescribeElementResponse {\n description: string;\n error?: string;\n}\n\nexport interface LocatorValidatorOption {\n centerDistanceThreshold?: number;\n}\n\nexport interface LocateValidatorResult {\n pass: boolean;\n rect: Rect;\n center: [number, number];\n centerDistance?: number;\n}\n\nexport interface AgentDescribeElementAtPointResult {\n prompt: string;\n deepThink: boolean;\n verifyResult?: LocateValidatorResult;\n}\n\n/**\n * context\n */\n\nexport abstract class UIContext {\n abstract screenshotBase64: string;\n\n abstract size: Size;\n\n abstract _isFrozen?: boolean;\n}\n\nexport type EnsureObject<T> = { [K in keyof T]: any };\n\nexport type ServiceAction = 'locate' | 'extract' | 'assert' | 'describe';\n\nexport type ServiceExtractParam = string | Record<string, string>;\n\nexport type ElementCacheFeature = Record<string, unknown>;\n\nexport interface LocateResult {\n element: LocateResultElement | null;\n rect?: Rect;\n}\n\nexport type ThinkingLevel = 'off' | 'medium' | 'high';\nexport type ThinkingStrategy = 'off' | 'cot';\n\nexport interface ServiceTaskInfo {\n durationMs: number;\n formatResponse?: string;\n rawResponse?: string;\n usage?: AIUsageInfo;\n searchArea?: Rect;\n searchAreaRawResponse?: string;\n searchAreaUsage?: AIUsageInfo;\n}\n\nexport interface DumpMeta {\n logTime: number;\n}\n\nexport interface ReportDumpWithAttributes {\n dumpString: string;\n attributes?: Record<string, any>;\n}\n\nexport interface ServiceDump extends DumpMeta {\n type: 'locate' | 'extract' | 'assert';\n logId: string;\n userQuery: {\n element?: TUserPrompt;\n dataDemand?: ServiceExtractParam;\n assertion?: TUserPrompt;\n };\n matchedElement: LocateResultElement[];\n matchedRect?: Rect;\n deepThink?: boolean;\n data: any;\n assertionPass?: boolean;\n assertionThought?: string;\n taskInfo: ServiceTaskInfo;\n error?: string;\n output?: any;\n}\n\nexport type PartialServiceDumpFromSDK = Omit<\n ServiceDump,\n 'logTime' | 'logId' | 'model_name'\n>;\n\nexport interface ServiceResultBase {\n dump: ServiceDump;\n}\n\nexport type LocateResultWithDump = LocateResult & ServiceResultBase;\n\nexport interface ServiceExtractResult<T> extends ServiceResultBase {\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n}\n\nexport class ServiceError extends Error {\n dump: ServiceDump;\n\n constructor(message: string, dump: ServiceDump) {\n super(message);\n this.name = 'ServiceError';\n this.dump = dump;\n }\n}\n\n// intermediate variables to optimize the return value by AI\nexport interface LiteUISection {\n name: string;\n description: string;\n sectionCharacteristics: string;\n textIds: string[];\n}\n\nexport type ElementById = (id: string) => BaseElement | null;\n\nexport type ServiceAssertionResponse = AIAssertionResponse & {\n usage?: AIUsageInfo;\n};\n\n/**\n * agent\n */\n\nexport type OnTaskStartTip = (tip: string) => Promise<void> | void;\n\nexport interface AgentWaitForOpt {\n checkIntervalMs?: number;\n timeoutMs?: number;\n}\n\nexport interface AgentAssertOpt {\n keepRawResponse?: boolean;\n}\n\n/**\n * planning\n *\n */\n\nexport interface PlanningLocateParam extends DetailedLocateParam {\n bbox?: [number, number, number, number];\n}\n\nexport interface PlanningAction<ParamType = any> {\n thought?: string;\n type: string;\n param: ParamType;\n}\n\nexport interface RawResponsePlanningAIResponse {\n action: PlanningAction;\n more_actions_needed_by_instruction: boolean;\n log: string;\n sleep?: number;\n error?: string;\n}\n\nexport interface PlanningAIResponse\n extends Omit<RawResponsePlanningAIResponse, 'action'> {\n actions?: PlanningAction[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n yamlString?: string;\n}\n\nexport interface PlanningActionParamSleep {\n timeMs: number;\n}\n\nexport interface PlanningActionParamError {\n thought: string;\n}\n\nexport type PlanningActionParamWaitFor = AgentWaitForOpt & {};\n\nexport interface LongPressParam {\n duration?: number;\n}\n\nexport interface PullParam {\n direction: 'up' | 'down';\n distance?: number;\n duration?: number;\n}\n/**\n * misc\n */\n\nexport interface Color {\n name: string;\n hex: string;\n}\n\nexport interface BaseAgentParserOpt {\n selector?: string;\n}\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PuppeteerParserOpt extends BaseAgentParserOpt {}\n\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PlaywrightParserOpt extends BaseAgentParserOpt {}\n\n/*\naction\n*/\nexport interface ExecutionTaskProgressOptions {\n onTaskStart?: (task: ExecutionTask) => Promise<void> | void;\n}\n\nexport interface ExecutionRecorderItem {\n type: 'screenshot';\n ts: number;\n screenshot?: string;\n timing?: string;\n}\n\nexport type ExecutionTaskType = 'Planning' | 'Insight' | 'Action Space' | 'Log';\n\nexport interface ExecutorContext {\n task: ExecutionTask;\n element?: LocateResultElement | null;\n uiContext?: UIContext;\n}\n\nexport interface ExecutionTaskApply<\n Type extends ExecutionTaskType = any,\n TaskParam = any,\n TaskOutput = any,\n TaskLog = any,\n> {\n type: Type;\n subType?: string;\n subTask?: boolean;\n param?: TaskParam;\n thought?: string;\n uiContext?: UIContext;\n executor: (\n param: TaskParam,\n context: ExecutorContext,\n ) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>\n | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>\n | undefined\n | void;\n}\n\nexport interface ExecutionTaskHitBy {\n from: string;\n context: Record<string, any>;\n}\n\nexport interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {\n output?: TaskOutput;\n log?: TaskLog;\n recorder?: ExecutionRecorderItem[];\n hitBy?: ExecutionTaskHitBy;\n}\n\nexport type ExecutionTask<\n E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<\n any,\n any,\n any\n >,\n> = E &\n ExecutionTaskReturn<\n E extends ExecutionTaskApply<any, any, infer TaskOutput, any>\n ? TaskOutput\n : unknown,\n E extends ExecutionTaskApply<any, any, any, infer TaskLog>\n ? TaskLog\n : unknown\n > & {\n status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';\n error?: Error;\n errorMessage?: string;\n errorStack?: string;\n timing?: {\n start: number;\n end?: number;\n cost?: number;\n };\n usage?: AIUsageInfo;\n searchAreaUsage?: AIUsageInfo;\n };\n\nexport interface ExecutionDump extends DumpMeta {\n name: string;\n description?: string;\n tasks: ExecutionTask[];\n aiActionContext?: string;\n}\n\n/*\ntask - service-locate\n*/\nexport type ExecutionTaskInsightLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskInsightLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport type ExecutionTaskInsightDump = ServiceDump;\n\nexport type ExecutionTaskInsightLocateApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightLocateParam,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightLocate =\n ExecutionTask<ExecutionTaskInsightLocateApply>;\n\n/*\ntask - service-query\n*/\nexport interface ExecutionTaskInsightQueryParam {\n dataDemand: ServiceExtractParam;\n}\n\nexport interface ExecutionTaskInsightQueryOutput {\n data: any;\n}\n\nexport type ExecutionTaskInsightQueryApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightQueryParam,\n any,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightQuery =\n ExecutionTask<ExecutionTaskInsightQueryApply>;\n\n/*\ntask - assertion\n*/\nexport interface ExecutionTaskInsightAssertionParam {\n assertion: string;\n}\n\nexport type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightAssertionParam,\n ServiceAssertionResponse,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightAssertion =\n ExecutionTask<ExecutionTaskInsightAssertionApply>;\n\n/*\ntask - action (i.e. interact) \n*/\nexport type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<\n 'Action Space',\n ActionParam,\n void,\n void\n>;\n\nexport type ExecutionTaskAction = ExecutionTask<ExecutionTaskActionApply>;\n\n/*\ntask - Log\n*/\n\nexport type ExecutionTaskLogApply<\n LogParam = {\n content: string;\n },\n> = ExecutionTaskApply<'Log', LogParam, void, void>;\n\nexport type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;\n\n/*\ntask - planning\n*/\n\nexport type ExecutionTaskPlanningApply = ExecutionTaskApply<\n 'Planning',\n {\n userInstruction: string;\n aiActionContext?: string;\n },\n PlanningAIResponse\n>;\n\nexport type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;\n\n/*\ntask - planning-locate\n*/\nexport type ExecutionTaskPlanningLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskPlanningLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport type ExecutionTaskPlanningDump = ServiceDump;\n\nexport type ExecutionTaskPlanningLocateApply = ExecutionTaskApply<\n 'Planning',\n ExecutionTaskPlanningLocateParam,\n ExecutionTaskPlanningLocateOutput,\n ExecutionTaskPlanningDump\n>;\n\nexport type ExecutionTaskPlanningLocate =\n ExecutionTask<ExecutionTaskPlanningLocateApply>;\n\n/*\nGrouped dump\n*/\nexport interface GroupedActionDump {\n sdkVersion: string;\n groupName: string;\n groupDescription?: string;\n modelBriefs: string[];\n executions: ExecutionDump[];\n}\n\nexport type InterfaceType =\n | 'puppeteer'\n | 'playwright'\n | 'static'\n | 'chrome-extension-proxy'\n | 'android'\n | string;\n\nexport interface StreamingCodeGenerationOptions {\n /** Whether to enable streaming output */\n stream?: boolean;\n /** Callback function to handle streaming chunks */\n onChunk?: StreamingCallback;\n /** Callback function to handle streaming completion */\n onComplete?: (finalCode: string) => void;\n /** Callback function to handle streaming errors */\n onError?: (error: Error) => void;\n}\n\nexport type StreamingCallback = (chunk: CodeGenerationChunk) => void;\n\nexport interface CodeGenerationChunk {\n /** The incremental content chunk */\n content: string;\n /** The reasoning content */\n reasoning_content: string;\n /** The accumulated content so far */\n accumulated: string;\n /** Whether this is the final chunk */\n isComplete: boolean;\n /** Token usage information if available */\n usage?: AIUsageInfo;\n}\n\nexport interface StreamingAIResponse {\n /** The final accumulated content */\n content: string;\n /** Token usage information */\n usage?: AIUsageInfo;\n /** Whether the response was streamed */\n isStreamed: boolean;\n}\n\nexport interface DeviceAction<TParam = any, TReturn = any> {\n name: string;\n description?: string;\n interfaceAlias?: string;\n paramSchema?: z.ZodType<TParam>;\n call: (param: TParam, context: ExecutorContext) => Promise<TReturn> | TReturn;\n delayAfterRunner?: number;\n}\n\n/**\n * Type utilities for extracting types from DeviceAction definitions\n */\n\n/**\n * Extract parameter type from a DeviceAction\n */\nexport type ActionParam<Action extends DeviceAction<any, any>> =\n Action extends DeviceAction<infer P, any> ? P : never;\n\n/**\n * Extract return type from a DeviceAction\n */\nexport type ActionReturn<Action extends DeviceAction<any, any>> =\n Action extends DeviceAction<any, infer R> ? R : never;\n\n/**\n * Web-specific types\n */\nexport interface WebElementInfo extends BaseElement {\n id: string;\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n}\n\nexport type WebUIContext = UIContext;\n\n/**\n * Agent\n */\n\nexport type CacheConfig = {\n strategy?: 'read-only' | 'read-write' | 'write-only';\n id: string;\n};\n\nexport type Cache =\n | false // No read, no write\n | true // Will throw error at runtime - deprecated\n | CacheConfig; // Object configuration (requires explicit id)\n\nexport interface AgentOpt {\n testId?: string;\n // @deprecated\n cacheId?: string; // Keep backward compatibility, but marked as deprecated\n groupName?: string;\n groupDescription?: string;\n /* if auto generate report, default true */\n generateReport?: boolean;\n /* if auto print report msg, default true */\n autoPrintReportMsg?: boolean;\n onTaskStartTip?: OnTaskStartTip;\n aiActionContext?: string;\n /* custom report file name */\n reportFileName?: string;\n modelConfig?: TModelConfig;\n cache?: Cache;\n /**\n * Maximum number of replanning cycles for aiAct.\n * Defaults to 20 (40 for `vlm-ui-tars`) when not provided.\n * If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.\n */\n replanningCycleLimit?: number;\n\n /**\n * Custom OpenAI client factory function\n *\n * If provided, this function will be called to create OpenAI client instances\n * for each AI call, allowing you to:\n * - Wrap clients with observability tools (langsmith, langfuse)\n * - Use custom OpenAI-compatible clients\n * - Apply different configurations based on intent\n *\n * @param config - Resolved model configuration\n * @returns OpenAI client instance (original or wrapped)\n *\n * @example\n * ```typescript\n * createOpenAIClient: async (openai, opts) => {\n * // Wrap with langsmith for planning tasks\n * if (opts.baseURL?.includes('planning')) {\n * return wrapOpenAI(openai, { metadata: { task: 'planning' } });\n * }\n *\n * return openai;\n * }\n * ```\n */\n createOpenAIClient?: CreateOpenAIClientFn;\n}\n\nexport type TestStatus =\n | 'passed'\n | 'failed'\n | 'timedOut'\n | 'skipped'\n | 'interrupted';\n\nexport interface ReportFileWithAttributes {\n reportFilePath: string;\n reportAttributes: {\n testDuration: number;\n testStatus: TestStatus;\n testTitle: string;\n testId: string;\n testDescription: string;\n };\n}\n"],"names":["AIResponseFormat","UIContext","ServiceError","Error","message","dump"],"mappings":";AAAqD;;;;;;;;;;AAwC9C,IAAKA,yBAAgBA,WAAAA,GAAAA,SAAhBA,gBAAgB;;;WAAhBA;;AA0EL,MAAeC;AAMtB;AAyEO,MAAMC,qBAAqBC;IAGhC,YAAYC,OAAe,EAAEC,IAAiB,CAAE;QAC9C,KAAK,CAACD,UAHR;QAIE,IAAI,CAAC,IAAI,GAAG;QACZ,IAAI,CAAC,IAAI,GAAGC;IACd;AACF"}
|
|
1
|
+
{"version":3,"file":"types.mjs","sources":["../../src/types.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\n\nimport type { NodeType } from '@midscene/shared/constants';\nimport type { CreateOpenAIClientFn, TModelConfig } from '@midscene/shared/env';\nimport type {\n BaseElement,\n LocateResultElement,\n Rect,\n Size,\n} from '@midscene/shared/types';\nimport type { z } from 'zod';\nimport type { TUserPrompt } from './common';\nimport type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';\n\nexport type {\n ElementTreeNode,\n BaseElement,\n Rect,\n Size,\n Point,\n} from '@midscene/shared/types';\nexport * from './yaml';\n\nexport type AIUsageInfo = Record<string, any> & {\n prompt_tokens: number | undefined;\n completion_tokens: number | undefined;\n total_tokens: number | undefined;\n cached_input: number | undefined;\n time_cost: number | undefined;\n model_name: string | undefined;\n model_description: string | undefined;\n intent: string | undefined;\n};\n\nexport type { LocateResultElement };\n\n/**\n * openai\n *\n */\nexport enum AIResponseFormat {\n JSON = 'json_object',\n TEXT = 'text',\n}\n\nexport type AISingleElementResponseById = {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n};\n\nexport type AISingleElementResponseByPosition = {\n position?: {\n x: number;\n y: number;\n };\n bbox?: [number, number, number, number];\n reason: string;\n text: string;\n};\n\nexport type AISingleElementResponse = AISingleElementResponseById;\n\nexport interface AIElementCoordinatesResponse {\n bbox: [number, number, number, number];\n errors?: string[];\n}\n\nexport type AIElementResponse = AIElementCoordinatesResponse;\n\nexport interface AIDataExtractionResponse<DataDemand> {\n data: DataDemand;\n errors?: string[];\n thought?: string;\n}\n\nexport interface AISectionLocatorResponse {\n bbox: [number, number, number, number];\n references_bbox?: [number, number, number, number][];\n error?: string;\n}\n\nexport interface AIAssertionResponse {\n pass: boolean;\n thought: string;\n}\n\nexport interface AIDescribeElementResponse {\n description: string;\n error?: string;\n}\n\nexport interface LocatorValidatorOption {\n centerDistanceThreshold?: number;\n}\n\nexport interface LocateValidatorResult {\n pass: boolean;\n rect: Rect;\n center: [number, number];\n centerDistance?: number;\n}\n\nexport interface AgentDescribeElementAtPointResult {\n prompt: string;\n deepThink: boolean;\n verifyResult?: LocateValidatorResult;\n}\n\n/**\n * context\n */\n\nexport abstract class UIContext {\n abstract screenshotBase64: string;\n\n abstract size: Size;\n\n abstract _isFrozen?: boolean;\n}\n\nexport type EnsureObject<T> = { [K in keyof T]: any };\n\nexport type ServiceAction = 'locate' | 'extract' | 'assert' | 'describe';\n\nexport type ServiceExtractParam = string | Record<string, string>;\n\nexport type ElementCacheFeature = Record<string, unknown>;\n\nexport interface LocateResult {\n element: LocateResultElement | null;\n rect?: Rect;\n}\n\nexport type ThinkingLevel = 'off' | 'medium' | 'high';\n\nexport interface ServiceTaskInfo {\n durationMs: number;\n formatResponse?: string;\n rawResponse?: string;\n usage?: AIUsageInfo;\n searchArea?: Rect;\n searchAreaRawResponse?: string;\n searchAreaUsage?: AIUsageInfo;\n}\n\nexport interface DumpMeta {\n logTime: number;\n}\n\nexport interface ReportDumpWithAttributes {\n dumpString: string;\n attributes?: Record<string, any>;\n}\n\nexport interface ServiceDump extends DumpMeta {\n type: 'locate' | 'extract' | 'assert';\n logId: string;\n userQuery: {\n element?: TUserPrompt;\n dataDemand?: ServiceExtractParam;\n assertion?: TUserPrompt;\n };\n matchedElement: LocateResultElement[];\n matchedRect?: Rect;\n deepThink?: boolean;\n data: any;\n assertionPass?: boolean;\n assertionThought?: string;\n taskInfo: ServiceTaskInfo;\n error?: string;\n output?: any;\n}\n\nexport type PartialServiceDumpFromSDK = Omit<\n ServiceDump,\n 'logTime' | 'logId' | 'model_name'\n>;\n\nexport interface ServiceResultBase {\n dump: ServiceDump;\n}\n\nexport type LocateResultWithDump = LocateResult & ServiceResultBase;\n\nexport interface ServiceExtractResult<T> extends ServiceResultBase {\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n}\n\nexport class ServiceError extends Error {\n dump: ServiceDump;\n\n constructor(message: string, dump: ServiceDump) {\n super(message);\n this.name = 'ServiceError';\n this.dump = dump;\n }\n}\n\n// intermediate variables to optimize the return value by AI\nexport interface LiteUISection {\n name: string;\n description: string;\n sectionCharacteristics: string;\n textIds: string[];\n}\n\nexport type ElementById = (id: string) => BaseElement | null;\n\nexport type ServiceAssertionResponse = AIAssertionResponse & {\n usage?: AIUsageInfo;\n};\n\n/**\n * agent\n */\n\nexport type OnTaskStartTip = (tip: string) => Promise<void> | void;\n\nexport interface AgentWaitForOpt {\n checkIntervalMs?: number;\n timeoutMs?: number;\n [key: string]: unknown;\n}\n\nexport interface AgentAssertOpt {\n keepRawResponse?: boolean;\n}\n\n/**\n * planning\n *\n */\n\nexport interface PlanningLocateParam extends DetailedLocateParam {\n bbox?: [number, number, number, number];\n}\n\nexport interface PlanningAction<ParamType = any> {\n thought?: string;\n type: string;\n param: ParamType;\n}\n\nexport interface RawResponsePlanningAIResponse {\n action: PlanningAction;\n more_actions_needed_by_instruction: boolean;\n log: string;\n sleep?: number;\n error?: string;\n}\n\nexport interface PlanningAIResponse\n extends Omit<RawResponsePlanningAIResponse, 'action'> {\n actions?: PlanningAction[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n yamlString?: string;\n error?: string;\n}\n\nexport interface PlanningActionParamSleep {\n timeMs: number;\n}\n\nexport interface PlanningActionParamError {\n thought: string;\n}\n\nexport type PlanningActionParamWaitFor = AgentWaitForOpt & {};\n\nexport interface LongPressParam {\n duration?: number;\n}\n\nexport interface PullParam {\n direction: 'up' | 'down';\n distance?: number;\n duration?: number;\n}\n/**\n * misc\n */\n\nexport interface Color {\n name: string;\n hex: string;\n}\n\nexport interface BaseAgentParserOpt {\n selector?: string;\n}\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PuppeteerParserOpt extends BaseAgentParserOpt {}\n\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PlaywrightParserOpt extends BaseAgentParserOpt {}\n\n/*\naction\n*/\nexport interface ExecutionTaskProgressOptions {\n onTaskStart?: (task: ExecutionTask) => Promise<void> | void;\n}\n\nexport interface ExecutionRecorderItem {\n type: 'screenshot';\n ts: number;\n screenshot?: string;\n timing?: string;\n}\n\nexport type ExecutionTaskType = 'Planning' | 'Insight' | 'Action Space' | 'Log';\n\nexport interface ExecutorContext {\n task: ExecutionTask;\n element?: LocateResultElement | null;\n uiContext?: UIContext;\n}\n\nexport interface ExecutionTaskApply<\n Type extends ExecutionTaskType = any,\n TaskParam = any,\n TaskOutput = any,\n TaskLog = any,\n> {\n type: Type;\n subType?: string;\n subTask?: boolean;\n param?: TaskParam;\n thought?: string;\n uiContext?: UIContext;\n executor: (\n param: TaskParam,\n context: ExecutorContext,\n ) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>\n | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>\n | undefined\n | void;\n}\n\nexport interface ExecutionTaskHitBy {\n from: string;\n context: Record<string, any>;\n}\n\nexport interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {\n output?: TaskOutput;\n log?: TaskLog;\n recorder?: ExecutionRecorderItem[];\n hitBy?: ExecutionTaskHitBy;\n}\n\nexport type ExecutionTask<\n E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<\n any,\n any,\n any\n >,\n> = E &\n ExecutionTaskReturn<\n E extends ExecutionTaskApply<any, any, infer TaskOutput, any>\n ? TaskOutput\n : unknown,\n E extends ExecutionTaskApply<any, any, any, infer TaskLog>\n ? TaskLog\n : unknown\n > & {\n status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';\n error?: Error;\n errorMessage?: string;\n errorStack?: string;\n timing?: {\n start: number;\n end?: number;\n cost?: number;\n };\n usage?: AIUsageInfo;\n searchAreaUsage?: AIUsageInfo;\n };\n\nexport interface ExecutionDump extends DumpMeta {\n name: string;\n description?: string;\n tasks: ExecutionTask[];\n aiActionContext?: string;\n}\n\n/*\ntask - service-locate\n*/\nexport type ExecutionTaskInsightLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskInsightLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport type ExecutionTaskInsightDump = ServiceDump;\n\nexport type ExecutionTaskInsightLocateApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightLocateParam,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightLocate =\n ExecutionTask<ExecutionTaskInsightLocateApply>;\n\n/*\ntask - service-query\n*/\nexport interface ExecutionTaskInsightQueryParam {\n dataDemand: ServiceExtractParam;\n}\n\nexport interface ExecutionTaskInsightQueryOutput {\n data: any;\n}\n\nexport type ExecutionTaskInsightQueryApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightQueryParam,\n any,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightQuery =\n ExecutionTask<ExecutionTaskInsightQueryApply>;\n\n/*\ntask - assertion\n*/\nexport interface ExecutionTaskInsightAssertionParam {\n assertion: string;\n}\n\nexport type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightAssertionParam,\n ServiceAssertionResponse,\n ExecutionTaskInsightDump\n>;\n\nexport type ExecutionTaskInsightAssertion =\n ExecutionTask<ExecutionTaskInsightAssertionApply>;\n\n/*\ntask - action (i.e. interact) \n*/\nexport type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<\n 'Action Space',\n ActionParam,\n void,\n void\n>;\n\nexport type ExecutionTaskAction = ExecutionTask<ExecutionTaskActionApply>;\n\n/*\ntask - Log\n*/\n\nexport type ExecutionTaskLogApply<\n LogParam = {\n content: string;\n },\n> = ExecutionTaskApply<'Log', LogParam, void, void>;\n\nexport type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;\n\n/*\ntask - planning\n*/\n\nexport type ExecutionTaskPlanningApply = ExecutionTaskApply<\n 'Planning',\n {\n userInstruction: string;\n aiActionContext?: string;\n },\n PlanningAIResponse\n>;\n\nexport type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;\n\n/*\ntask - planning-locate\n*/\nexport type ExecutionTaskPlanningLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskPlanningLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport type ExecutionTaskPlanningDump = ServiceDump;\n\nexport type ExecutionTaskPlanningLocateApply = ExecutionTaskApply<\n 'Planning',\n ExecutionTaskPlanningLocateParam,\n ExecutionTaskPlanningLocateOutput,\n ExecutionTaskPlanningDump\n>;\n\nexport type ExecutionTaskPlanningLocate =\n ExecutionTask<ExecutionTaskPlanningLocateApply>;\n\n/*\nGrouped dump\n*/\nexport interface GroupedActionDump {\n sdkVersion: string;\n groupName: string;\n groupDescription?: string;\n modelBriefs: string[];\n executions: ExecutionDump[];\n}\n\nexport type InterfaceType =\n | 'puppeteer'\n | 'playwright'\n | 'static'\n | 'chrome-extension-proxy'\n | 'android'\n | string;\n\nexport interface StreamingCodeGenerationOptions {\n /** Whether to enable streaming output */\n stream?: boolean;\n /** Callback function to handle streaming chunks */\n onChunk?: StreamingCallback;\n /** Callback function to handle streaming completion */\n onComplete?: (finalCode: string) => void;\n /** Callback function to handle streaming errors */\n onError?: (error: Error) => void;\n}\n\nexport type StreamingCallback = (chunk: CodeGenerationChunk) => void;\n\nexport interface CodeGenerationChunk {\n /** The incremental content chunk */\n content: string;\n /** The reasoning content */\n reasoning_content: string;\n /** The accumulated content so far */\n accumulated: string;\n /** Whether this is the final chunk */\n isComplete: boolean;\n /** Token usage information if available */\n usage?: AIUsageInfo;\n}\n\nexport interface StreamingAIResponse {\n /** The final accumulated content */\n content: string;\n /** Token usage information */\n usage?: AIUsageInfo;\n /** Whether the response was streamed */\n isStreamed: boolean;\n}\n\nexport interface DeviceAction<TParam = any, TReturn = any> {\n name: string;\n description?: string;\n interfaceAlias?: string;\n paramSchema?: z.ZodType<TParam>;\n call: (param: TParam, context: ExecutorContext) => Promise<TReturn> | TReturn;\n delayAfterRunner?: number;\n}\n\n/**\n * Type utilities for extracting types from DeviceAction definitions\n */\n\n/**\n * Extract parameter type from a DeviceAction\n */\nexport type ActionParam<Action extends DeviceAction<any, any>> =\n Action extends DeviceAction<infer P, any> ? P : never;\n\n/**\n * Extract return type from a DeviceAction\n */\nexport type ActionReturn<Action extends DeviceAction<any, any>> =\n Action extends DeviceAction<any, infer R> ? R : never;\n\n/**\n * Web-specific types\n */\nexport interface WebElementInfo extends BaseElement {\n id: string;\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n}\n\nexport type WebUIContext = UIContext;\n\n/**\n * Agent\n */\n\nexport type CacheConfig = {\n strategy?: 'read-only' | 'read-write' | 'write-only';\n id: string;\n};\n\nexport type Cache =\n | false // No read, no write\n | true // Will throw error at runtime - deprecated\n | CacheConfig; // Object configuration (requires explicit id)\n\nexport interface AgentOpt {\n testId?: string;\n // @deprecated\n cacheId?: string; // Keep backward compatibility, but marked as deprecated\n groupName?: string;\n groupDescription?: string;\n /* if auto generate report, default true */\n generateReport?: boolean;\n /* if auto print report msg, default true */\n autoPrintReportMsg?: boolean;\n onTaskStartTip?: OnTaskStartTip;\n aiActionContext?: string;\n /* custom report file name */\n reportFileName?: string;\n modelConfig?: TModelConfig;\n cache?: Cache;\n /**\n * Maximum number of replanning cycles for aiAct.\n * Defaults to 20 (40 for `vlm-ui-tars`) when not provided.\n * If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.\n */\n replanningCycleLimit?: number;\n\n /**\n * Custom OpenAI client factory function\n *\n * If provided, this function will be called to create OpenAI client instances\n * for each AI call, allowing you to:\n * - Wrap clients with observability tools (langsmith, langfuse)\n * - Use custom OpenAI-compatible clients\n * - Apply different configurations based on intent\n *\n * @param config - Resolved model configuration\n * @returns OpenAI client instance (original or wrapped)\n *\n * @example\n * ```typescript\n * createOpenAIClient: async (openai, opts) => {\n * // Wrap with langsmith for planning tasks\n * if (opts.baseURL?.includes('planning')) {\n * return wrapOpenAI(openai, { metadata: { task: 'planning' } });\n * }\n *\n * return openai;\n * }\n * ```\n */\n createOpenAIClient?: CreateOpenAIClientFn;\n}\n\nexport type TestStatus =\n | 'passed'\n | 'failed'\n | 'timedOut'\n | 'skipped'\n | 'interrupted';\n\nexport interface ReportFileWithAttributes {\n reportFilePath: string;\n reportAttributes: {\n testDuration: number;\n testStatus: TestStatus;\n testTitle: string;\n testId: string;\n testDescription: string;\n };\n}\n"],"names":["AIResponseFormat","UIContext","ServiceError","Error","message","dump"],"mappings":";AAAqD;;;;;;;;;;AAwC9C,IAAKA,yBAAgBA,WAAAA,GAAAA,SAAhBA,gBAAgB;;;WAAhBA;;AA0EL,MAAeC;AAMtB;AAwEO,MAAMC,qBAAqBC;IAGhC,YAAYC,OAAe,EAAEC,IAAiB,CAAE;QAC9C,KAAK,CAACD,UAHR;QAIE,IAAI,CAAC,IAAI,GAAG;QACZ,IAAI,CAAC,IAAI,GAAGC;IACd;AACF"}
|