@midscene/core 1.2.2-beta-20260115092052.0 → 1.2.2-beta-20260116060040.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +27 -24
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +15 -29
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +4 -2
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/inspect.mjs +3 -3
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +14 -4
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +44 -9
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +11 -10
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +3 -2
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/common.mjs +14 -5
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/device/index.mjs +29 -4
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/dump/html-utils.mjs +42 -0
- package/dist/es/dump/html-utils.mjs.map +1 -0
- package/dist/es/dump/image-restoration.mjs +31 -0
- package/dist/es/dump/image-restoration.mjs.map +1 -0
- package/dist/es/dump/index.mjs +3 -0
- package/dist/es/index.mjs +4 -2
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/screenshot-item.mjs +34 -0
- package/dist/es/screenshot-item.mjs.map +1 -0
- package/dist/es/service/index.mjs +2 -1
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/task-runner.mjs +7 -9
- package/dist/es/task-runner.mjs.map +1 -1
- package/dist/es/types.mjs +78 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +3 -2
- package/dist/es/utils.mjs.map +1 -1
- package/dist/lib/agent/agent.js +24 -21
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +15 -29
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +4 -2
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +3 -3
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +13 -3
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +44 -9
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +11 -10
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +3 -2
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/common.js +20 -5
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/device/index.js +53 -16
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/dump/html-utils.js +94 -0
- package/dist/lib/dump/html-utils.js.map +1 -0
- package/dist/lib/dump/image-restoration.js +65 -0
- package/dist/lib/dump/image-restoration.js.map +1 -0
- package/dist/lib/dump/index.js +60 -0
- package/dist/lib/dump/index.js.map +1 -0
- package/dist/lib/index.js +42 -7
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/screenshot-item.js +68 -0
- package/dist/lib/screenshot-item.js.map +1 -0
- package/dist/lib/service/index.js +2 -1
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/task-runner.js +7 -9
- package/dist/lib/task-runner.js.map +1 -1
- package/dist/lib/types.js +91 -3
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +3 -2
- package/dist/lib/utils.js.map +1 -1
- package/dist/types/agent/agent.d.ts +5 -16
- package/dist/types/agent/tasks.d.ts +4 -4
- package/dist/types/ai-model/service-caller/index.d.ts +3 -3
- package/dist/types/common.d.ts +8 -1
- package/dist/types/device/index.d.ts +85 -23
- package/dist/types/dump/html-utils.d.ts +7 -0
- package/dist/types/dump/image-restoration.d.ts +6 -0
- package/dist/types/dump/index.d.ts +5 -0
- package/dist/types/index.d.ts +3 -1
- package/dist/types/screenshot-item.d.ts +27 -0
- package/dist/types/task-runner.d.ts +1 -1
- package/dist/types/types.d.ts +61 -6
- package/package.json +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"Click the login button\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","field","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","paramLine","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACH,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAME,aACJ,AACE,cADF,OAAQF,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMG,kBAAkBD,aAAa,GAAGH,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMK,WAAWC,eAAeL,OAAOR;gBAGvC,MAAMc,cAAcC,kBAAkBP;gBAGtC,IAAIQ,YAAY,GAAGL,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,IAAIE,aACFE,aAAa,CAAC,IAAI,EAAEF,aAAa;gBAGnCX,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACc;oBAClBf,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEe,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAML,WAAWC,eAAeT;YAChC,MAAMU,cAAcC,kBAAkBX;YAGtC,IAAIc,mBAAmB,CAAC,SAAS,EAAEN,UAAU;YAC7C,IAAIE,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpBhB,OAAO,IAAI,CAACgB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEnB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAekB,2BAA2B,EAC/CC,WAAW,EACXxB,MAAM,EACNyB,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACzB,QAClB,MAAM,IAAI0B,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAACrB,SACtCD,qBACLC,QACAJ,cAAc0B,cAAczB,SAAS4B;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAEhC,mBAAmB;;;;;;;;;;;;;;;;;;;;;oCAqBa,EAAEE,UAAUyB,cAAc,mCAAmC,GAAG;;;;;AAKpG,CAAC;AACD"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"thought\": string, // your thought process about the next action\n \"note\"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action.\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n \"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"thought\": \"The form has already been filled, I need to click the login button to login\",\n \"log\": \"Click the login button\",\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n\nFor example, if the instruction is to find out every title in the screenshot, the return value should be:\n\n{\n \"thought\": \"I need to note the titles in the current screenshot for further processing and scroll to find more titles\",\n \"note\": \"The titles in the current screenshot are: 'Hello, world!', 'Midscene 101', 'Model strategy'\",\n \"log\": \"Scroll to find more titles\",\n \"action\": {\n \"type\": \"Scroll\",\n \"param\": {\n \"locate\": {\n \"prompt\": \"The page content area\"\n },\n \"direction\": \"down\"\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;;;yNAG6L,CAAC;AAE1N,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKd,MAAM,IAAIe,OAAO,OAAO,CAACF,OACxC,IAAIb,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMgB,aACJ,AACE,cADF,OAAQhB,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMiB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMI,WAAWC,eAAenB,OAAOO;gBAGvC,MAAMa,cAAcC,kBAAkBrB;gBAGtC,MAAMsB,eAAevB,iBAAiBC;gBACtC,MAAMuB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3ChB,WAAW,IAAI,CAACe;YAClB;YAIF,IAAIf,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACmB;oBAClBpB,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEoB,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,eAAeR;YAChC,MAAMS,cAAcC,kBAAkBV;YAGtC,IAAImB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBrB,OAAO,IAAI,CAACqB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAExB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeuB,2BAA2B,EAC/CC,WAAW,EACXnC,MAAM,EACNoC,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACpC,QAClB,MAAM,IAAIqC,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAAC1B,SACtCD,qBACLC,QACAV,cAAcqC,cAAcpC,SAAS2B;IAGzC,MAAMY,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,MAAME,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;AAeV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;EAMpB,EAAE1C,mBAAmB;;;;;;;;;;;;;;;;;;;oCAmBa,EAAEE,UAAUoC,cAAc,mCAAmC,GAAG;;;;;;;;;;;;;;;;;;;;;;AAsBpG,CAAC;AACD"}
|
|
@@ -4,7 +4,7 @@ import { assert, ifInBrowser } from "@midscene/shared/utils";
|
|
|
4
4
|
import { jsonrepair } from "jsonrepair";
|
|
5
5
|
import openai_0 from "openai";
|
|
6
6
|
async function createChatClient({ modelConfig }) {
|
|
7
|
-
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, modelDescription, uiTarsModelVersion: uiTarsVersion, vlMode, createOpenAIClient, timeout } = modelConfig;
|
|
7
|
+
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, modelDescription, uiTarsModelVersion: uiTarsVersion, vlMode, modelFamily, createOpenAIClient, timeout } = modelConfig;
|
|
8
8
|
let proxyAgent;
|
|
9
9
|
const debugProxy = getDebug('ai:call:proxy');
|
|
10
10
|
const sanitizeProxyUrl = (url)=>{
|
|
@@ -99,11 +99,12 @@ async function createChatClient({ modelConfig }) {
|
|
|
99
99
|
modelName,
|
|
100
100
|
modelDescription,
|
|
101
101
|
uiTarsVersion,
|
|
102
|
-
vlMode
|
|
102
|
+
vlMode,
|
|
103
|
+
modelFamily
|
|
103
104
|
};
|
|
104
105
|
}
|
|
105
106
|
async function callAI(messages, modelConfig, options) {
|
|
106
|
-
const { completion, modelName, modelDescription, uiTarsVersion, vlMode } = await createChatClient({
|
|
107
|
+
const { completion, modelName, modelDescription, uiTarsVersion, vlMode, modelFamily } = await createChatClient({
|
|
107
108
|
modelConfig
|
|
108
109
|
});
|
|
109
110
|
const maxTokens = globalConfigManager.getEnvConfigValueAsNumber(MIDSCENE_MODEL_MAX_TOKENS) ?? globalConfigManager.getEnvConfigValueAsNumber(OPENAI_MAX_TOKENS);
|
|
@@ -142,7 +143,7 @@ async function callAI(messages, modelConfig, options) {
|
|
|
142
143
|
};
|
|
143
144
|
const { config: deepThinkConfig, debugMessage, warningMessage } = resolveDeepThinkConfig({
|
|
144
145
|
deepThink: options?.deepThink,
|
|
145
|
-
|
|
146
|
+
modelFamily
|
|
146
147
|
});
|
|
147
148
|
if (debugMessage) debugCall(debugMessage);
|
|
148
149
|
if (warningMessage) {
|
|
@@ -290,19 +291,19 @@ function preprocessDoubaoBboxJson(input) {
|
|
|
290
291
|
if (input.includes('bbox')) while(/\d+\s+\d+/.test(input))input = input.replace(/(\d+)\s+(\d+)/g, '$1,$2');
|
|
291
292
|
return input;
|
|
292
293
|
}
|
|
293
|
-
function resolveDeepThinkConfig({ deepThink,
|
|
294
|
+
function resolveDeepThinkConfig({ deepThink, modelFamily }) {
|
|
294
295
|
const normalizedDeepThink = 'unset' === deepThink ? void 0 : deepThink;
|
|
295
296
|
if (void 0 === normalizedDeepThink) return {
|
|
296
297
|
config: {},
|
|
297
298
|
debugMessage: void 0
|
|
298
299
|
};
|
|
299
|
-
if ('qwen3-vl' ===
|
|
300
|
+
if ('qwen3-vl' === modelFamily) return {
|
|
300
301
|
config: {
|
|
301
302
|
enable_thinking: normalizedDeepThink
|
|
302
303
|
},
|
|
303
304
|
debugMessage: `deepThink mapped to enable_thinking=${normalizedDeepThink} for qwen3-vl`
|
|
304
305
|
};
|
|
305
|
-
if ('doubao-vision' ===
|
|
306
|
+
if ('doubao-vision' === modelFamily) return {
|
|
306
307
|
config: {
|
|
307
308
|
thinking: {
|
|
308
309
|
type: normalizedDeepThink ? 'enabled' : 'disabled'
|
|
@@ -310,7 +311,7 @@ function resolveDeepThinkConfig({ deepThink, vlMode }) {
|
|
|
310
311
|
},
|
|
311
312
|
debugMessage: `deepThink mapped to thinking.type=${normalizedDeepThink ? 'enabled' : 'disabled'} for doubao-vision`
|
|
312
313
|
};
|
|
313
|
-
if ('glm-v' ===
|
|
314
|
+
if ('glm-v' === modelFamily) return {
|
|
314
315
|
config: {
|
|
315
316
|
thinking: {
|
|
316
317
|
type: normalizedDeepThink ? 'enabled' : 'disabled'
|
|
@@ -320,8 +321,8 @@ function resolveDeepThinkConfig({ deepThink, vlMode }) {
|
|
|
320
321
|
};
|
|
321
322
|
return {
|
|
322
323
|
config: {},
|
|
323
|
-
debugMessage: `deepThink ignored: unsupported model_family "${
|
|
324
|
-
warningMessage: `The "deepThink" option is not supported for model_family "${
|
|
324
|
+
debugMessage: `deepThink ignored: unsupported model_family "${modelFamily ?? 'default'}"`,
|
|
325
|
+
warningMessage: `The "deepThink" option is not supported for model_family "${modelFamily ?? 'default'}".`
|
|
325
326
|
};
|
|
326
327
|
}
|
|
327
328
|
function normalizeJsonObject(obj) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["../../../../src/ai-model/service-caller/index.ts"],"sourcesContent":["import {\n AIResponseFormat,\n type AIUsageInfo,\n type DeepThinkOption,\n} from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport {\n type IModelConfig,\n MIDSCENE_LANGFUSE_DEBUG,\n MIDSCENE_LANGSMITH_DEBUG,\n MIDSCENE_MODEL_MAX_TOKENS,\n OPENAI_MAX_TOKENS,\n type TVlModeTypes,\n type UITarsModelVersion,\n globalConfigManager,\n} from '@midscene/shared/env';\n\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert, ifInBrowser } from '@midscene/shared/utils';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport type { AIArgs } from '../../common';\n\nasync function createChatClient({\n modelConfig,\n}: {\n modelConfig: IModelConfig;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n modelName: string;\n modelDescription: string;\n uiTarsVersion?: UITarsModelVersion;\n vlMode: TVlModeTypes | undefined;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n modelDescription,\n uiTarsModelVersion: uiTarsVersion,\n vlMode,\n createOpenAIClient,\n timeout,\n } = modelConfig;\n\n let proxyAgent: any = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n\n // Helper function to sanitize proxy URL for logging (remove credentials)\n // Uses URL API instead of regex to avoid ReDoS vulnerabilities\n const sanitizeProxyUrl = (url: string): string => {\n try {\n const parsed = new URL(url);\n if (parsed.username) {\n // Keep username for debugging, hide password for security\n parsed.password = '****';\n return parsed.href;\n }\n return url;\n } catch {\n // If URL parsing fails, return original URL (will be caught later)\n return url;\n }\n };\n\n if (httpProxy) {\n debugProxy('using http proxy', sanitizeProxyUrl(httpProxy));\n if (ifInBrowser) {\n console.warn(\n 'HTTP proxy is configured but not supported in browser environment',\n );\n } else {\n // Dynamic import with variable to avoid bundler static analysis\n const moduleName = 'undici';\n const { ProxyAgent } = await import(moduleName);\n proxyAgent = new ProxyAgent({\n uri: httpProxy,\n // Note: authentication is handled via the URI (e.g., http://user:pass@proxy.com:8080)\n });\n }\n } else if (socksProxy) {\n debugProxy('using socks proxy', sanitizeProxyUrl(socksProxy));\n if (ifInBrowser) {\n console.warn(\n 'SOCKS proxy is configured but not supported in browser environment',\n );\n } else {\n try {\n // Dynamic import with variable to avoid bundler static analysis\n const moduleName = 'fetch-socks';\n const { socksDispatcher } = await import(moduleName);\n // Parse SOCKS proxy URL (e.g., socks5://127.0.0.1:1080)\n const proxyUrl = new URL(socksProxy);\n\n // Validate hostname\n if (!proxyUrl.hostname) {\n throw new Error('SOCKS proxy URL must include a valid hostname');\n }\n\n // Validate and parse port\n const port = Number.parseInt(proxyUrl.port, 10);\n if (!proxyUrl.port || Number.isNaN(port)) {\n throw new Error('SOCKS proxy URL must include a valid port');\n }\n\n // Parse SOCKS version from protocol\n const protocol = proxyUrl.protocol.replace(':', '');\n const socksType =\n protocol === 'socks4' ? 4 : protocol === 'socks5' ? 5 : 5;\n\n proxyAgent = socksDispatcher({\n type: socksType,\n host: proxyUrl.hostname,\n port,\n ...(proxyUrl.username\n ? {\n userId: decodeURIComponent(proxyUrl.username),\n password: decodeURIComponent(proxyUrl.password || ''),\n }\n : {}),\n });\n debugProxy('socks proxy configured successfully', {\n type: socksType,\n host: proxyUrl.hostname,\n port: port,\n });\n } catch (error) {\n console.error('Failed to configure SOCKS proxy:', error);\n throw new Error(\n `Invalid SOCKS proxy URL: ${socksProxy}. Expected format: socks4://host:port, socks5://host:port, or with authentication: socks5://user:pass@host:port`,\n );\n }\n }\n }\n\n const openAIOptions = {\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n // Use fetchOptions.dispatcher for fetch-based SDK instead of httpAgent\n // Note: Type assertion needed due to undici version mismatch between dependencies\n ...(proxyAgent ? { fetchOptions: { dispatcher: proxyAgent as any } } : {}),\n ...openaiExtraConfig,\n ...(typeof timeout === 'number' ? { timeout } : {}),\n dangerouslyAllowBrowser: true,\n };\n\n const baseOpenAI = new OpenAI(openAIOptions);\n\n let openai: OpenAI = baseOpenAI;\n\n // LangSmith wrapper\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n // Use variable to prevent static analysis by bundlers\n const langsmithModule = 'langsmith/wrappers';\n const { wrapOpenAI } = await import(langsmithModule);\n openai = wrapOpenAI(openai);\n }\n\n // Langfuse wrapper\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGFUSE_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langfuse is not supported in browser');\n }\n console.log('DEBUGGING MODE: langfuse wrapper enabled');\n // Use variable to prevent static analysis by bundlers\n const langfuseModule = 'langfuse';\n const { observeOpenAI } = await import(langfuseModule);\n openai = observeOpenAI(openai);\n }\n\n if (createOpenAIClient) {\n const wrappedClient = await createOpenAIClient(baseOpenAI, openAIOptions);\n\n if (wrappedClient) {\n openai = wrappedClient as OpenAI;\n }\n }\n\n return {\n completion: openai.chat.completions,\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n };\n}\n\nexport async function callAI(\n messages: ChatCompletionMessageParam[],\n modelConfig: IModelConfig,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n deepThink?: DeepThinkOption;\n },\n): Promise<{\n content: string;\n reasoning_content?: string;\n usage?: AIUsageInfo;\n isStreamed: boolean;\n}> {\n const { completion, modelName, modelDescription, uiTarsVersion, vlMode } =\n await createChatClient({\n modelConfig,\n });\n\n const maxTokens =\n globalConfigManager.getEnvConfigValueAsNumber(MIDSCENE_MODEL_MAX_TOKENS) ??\n globalConfigManager.getEnvConfigValueAsNumber(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n const temperature = modelConfig.temperature ?? 0;\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let accumulatedReasoning = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const buildUsageInfo = (usageData?: OpenAI.CompletionUsage) => {\n if (!usageData) return undefined;\n\n const cachedInputTokens = (\n usageData as { prompt_tokens_details?: { cached_tokens?: number } }\n )?.prompt_tokens_details?.cached_tokens;\n\n return {\n prompt_tokens: usageData.prompt_tokens ?? 0,\n completion_tokens: usageData.completion_tokens ?? 0,\n total_tokens: usageData.total_tokens ?? 0,\n cached_input: cachedInputTokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelConfig.intent,\n } satisfies AIUsageInfo;\n };\n\n const commonConfig = {\n temperature,\n stream: !!isStreaming,\n max_tokens: maxTokens,\n ...(vlMode === 'qwen2.5-vl' // qwen vl v2 specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n const {\n config: deepThinkConfig,\n debugMessage,\n warningMessage,\n } = resolveDeepThinkConfig({\n deepThink: options?.deepThink,\n vlMode,\n });\n if (debugMessage) {\n debugCall(debugMessage);\n }\n if (warningMessage) {\n debugCall(warningMessage);\n console.warn(warningMessage);\n }\n\n try {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n ...commonConfig,\n ...deepThinkConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n accumulatedReasoning += reasoning_content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: buildUsageInfo(usage),\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlMode || 'default'}, cost-ms, ${timeCost}, temperature, ${temperature ?? ''}`,\n );\n } else {\n // Non-streaming with retry logic\n const retryCount = modelConfig.retryCount ?? 1;\n const retryInterval = modelConfig.retryInterval ?? 2000;\n const maxAttempts = retryCount + 1; // retryCount=1 means 2 total attempts (1 initial + 1 retry)\n\n let lastError: Error | undefined;\n\n for (let attempt = 1; attempt <= maxAttempts; attempt++) {\n try {\n const result = await completion.create({\n model: modelName,\n messages,\n ...commonConfig,\n ...deepThinkConfig,\n } as any);\n\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlMode || 'default'}, ui-tars-version, ${uiTarsVersion}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}, temperature, ${temperature ?? ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n if (!result.choices) {\n throw new Error(\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n }\n\n content = result.choices[0].message.content!;\n if (!content) {\n throw new Error('empty content from AI model');\n }\n\n accumulatedReasoning =\n (result.choices[0].message as any)?.reasoning_content || '';\n usage = result.usage;\n break; // Success, exit retry loop\n } catch (error) {\n lastError = error as Error;\n if (attempt < maxAttempts) {\n console.warn(\n `[Midscene] AI call failed (attempt ${attempt}/${maxAttempts}), retrying in ${retryInterval}ms... Error: ${lastError.message}`,\n );\n await new Promise((resolve) => setTimeout(resolve, retryInterval));\n }\n }\n }\n\n if (!content) {\n throw lastError;\n }\n }\n\n debugCall(`response reasoning content: ${accumulatedReasoning}`);\n debugCall(`response content: ${content}`);\n\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n } as OpenAI.CompletionUsage;\n }\n\n return {\n content: content || '',\n reasoning_content: accumulatedReasoning || undefined,\n usage: buildUsageInfo(usage),\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service (${modelName}): ${e.message}\\nTrouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport async function callAIWithObjectResponse<T>(\n messages: ChatCompletionMessageParam[],\n modelConfig: IModelConfig,\n options?: {\n deepThink?: DeepThinkOption;\n },\n): Promise<{\n content: T;\n contentString: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const response = await callAI(messages, modelConfig, {\n deepThink: options?.deepThink,\n });\n assert(response, 'empty response');\n const vlMode = modelConfig.vlMode;\n const jsonContent = safeParseJson(response.content, vlMode);\n assert(\n typeof jsonContent === 'object',\n `failed to parse json response from model (${modelConfig.modelName}): ${response.content}`,\n );\n return {\n content: jsonContent,\n contentString: response.content,\n usage: response.usage,\n reasoning_content: response.reasoning_content,\n };\n}\n\nexport async function callAIWithStringResponse(\n msgs: AIArgs,\n modelConfig: IModelConfig,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await callAI(msgs, modelConfig);\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function resolveDeepThinkConfig({\n deepThink,\n vlMode,\n}: {\n deepThink?: DeepThinkOption;\n vlMode?: TVlModeTypes;\n}): {\n config: Record<string, unknown>;\n debugMessage?: string;\n warningMessage?: string;\n} {\n const normalizedDeepThink = deepThink === 'unset' ? undefined : deepThink;\n\n if (normalizedDeepThink === undefined) {\n return { config: {}, debugMessage: undefined };\n }\n\n if (vlMode === 'qwen3-vl') {\n return {\n config: { enable_thinking: normalizedDeepThink },\n debugMessage: `deepThink mapped to enable_thinking=${normalizedDeepThink} for qwen3-vl`,\n };\n }\n\n if (vlMode === 'doubao-vision') {\n return {\n config: {\n thinking: { type: normalizedDeepThink ? 'enabled' : 'disabled' },\n },\n debugMessage: `deepThink mapped to thinking.type=${normalizedDeepThink ? 'enabled' : 'disabled'} for doubao-vision`,\n };\n }\n\n if (vlMode === 'glm-v') {\n return {\n config: {\n thinking: { type: normalizedDeepThink ? 'enabled' : 'disabled' },\n },\n debugMessage: `deepThink mapped to thinking.type=${normalizedDeepThink ? 'enabled' : 'disabled'} for glm-v`,\n };\n }\n\n return {\n config: {},\n debugMessage: `deepThink ignored: unsupported model_family \"${vlMode ?? 'default'}\"`,\n warningMessage: `The \"deepThink\" option is not supported for model_family \"${vlMode ?? 'default'}\".`,\n };\n}\n\n/**\n * Normalize a parsed JSON object by trimming whitespace from:\n * 1. All object keys (e.g., \" prompt \" -> \"prompt\")\n * 2. All string values (e.g., \" Tap \" -> \"Tap\")\n * This handles LLM output that may include leading/trailing spaces.\n */\nfunction normalizeJsonObject(obj: any): any {\n // Handle null and undefined\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n // Handle arrays - recursively normalize each element\n if (Array.isArray(obj)) {\n return obj.map((item) => normalizeJsonObject(item));\n }\n\n // Handle objects\n if (typeof obj === 'object') {\n const normalized: any = {};\n\n for (const [key, value] of Object.entries(obj)) {\n // Trim the key to remove leading/trailing spaces\n const trimmedKey = key.trim();\n\n // Recursively normalize the value\n let normalizedValue = normalizeJsonObject(value);\n\n // Trim all string values\n if (typeof normalizedValue === 'string') {\n normalizedValue = normalizedValue.trim();\n }\n\n normalized[trimmedKey] = normalizedValue;\n }\n\n return normalized;\n }\n\n // Handle primitive strings\n if (typeof obj === 'string') {\n return obj.trim();\n }\n\n // Return other primitives as-is\n return obj;\n}\n\nexport function safeParseJson(input: string, vlMode: TVlModeTypes | undefined) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n\n let parsed: any;\n let lastError: unknown;\n try {\n parsed = JSON.parse(cleanJsonString);\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n try {\n parsed = JSON.parse(jsonrepair(cleanJsonString));\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n\n if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n try {\n parsed = JSON.parse(jsonrepair(jsonString));\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n }\n throw Error(\n `failed to parse LLM response into JSON. Error - ${String(\n lastError ?? 'unknown error',\n )}. Response - \\n ${input}`,\n );\n}\n"],"names":["createChatClient","modelConfig","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","modelDescription","uiTarsVersion","vlMode","createOpenAIClient","timeout","proxyAgent","debugProxy","getDebug","sanitizeProxyUrl","url","parsed","URL","ifInBrowser","console","moduleName","ProxyAgent","socksDispatcher","proxyUrl","Error","port","Number","protocol","socksType","decodeURIComponent","error","openAIOptions","baseOpenAI","OpenAI","openai","globalConfigManager","MIDSCENE_LANGSMITH_DEBUG","langsmithModule","wrapOpenAI","MIDSCENE_LANGFUSE_DEBUG","langfuseModule","observeOpenAI","wrappedClient","callAI","messages","options","completion","maxTokens","MIDSCENE_MODEL_MAX_TOKENS","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","temperature","isStreaming","content","accumulated","accumulatedReasoning","usage","timeCost","buildUsageInfo","usageData","cachedInputTokens","commonConfig","deepThinkConfig","debugMessage","warningMessage","resolveDeepThinkConfig","stream","chunk","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","retryCount","retryInterval","maxAttempts","lastError","attempt","result","JSON","Promise","resolve","setTimeout","e","newError","callAIWithObjectResponse","response","assert","jsonContent","safeParseJson","callAIWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","deepThink","normalizedDeepThink","normalizeJsonObject","obj","Array","item","normalized","key","value","Object","trimmedKey","normalizedValue","cleanJsonString","jsonrepair","jsonString","String"],"mappings":";;;;;AAyBA,eAAeA,iBAAiB,EAC9BC,WAAW,EAGZ;IAOC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,gBAAgB,EAChB,oBAAoBC,aAAa,EACjCC,MAAM,EACNC,kBAAkB,EAClBC,OAAO,EACR,GAAGX;IAEJ,IAAIY;IACJ,MAAMC,aAAaC,SAAS;IAI5B,MAAMC,mBAAmB,CAACC;QACxB,IAAI;YACF,MAAMC,SAAS,IAAIC,IAAIF;YACvB,IAAIC,OAAO,QAAQ,EAAE;gBAEnBA,OAAO,QAAQ,GAAG;gBAClB,OAAOA,OAAO,IAAI;YACpB;YACA,OAAOD;QACT,EAAE,OAAM;YAEN,OAAOA;QACT;IACF;IAEA,IAAId,WAAW;QACbW,WAAW,oBAAoBE,iBAAiBb;QAChD,IAAIiB,aACFC,QAAQ,IAAI,CACV;aAEG;YAEL,MAAMC,aAAa;YACnB,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAACD;YACpCT,aAAa,IAAIU,WAAW;gBAC1B,KAAKpB;YAEP;QACF;IACF,OAAO,IAAID,YAAY;QACrBY,WAAW,qBAAqBE,iBAAiBd;QACjD,IAAIkB,aACFC,QAAQ,IAAI,CACV;aAGF,IAAI;YAEF,MAAMC,aAAa;YACnB,MAAM,EAAEE,eAAe,EAAE,GAAG,MAAM,MAAM,CAACF;YAEzC,MAAMG,WAAW,IAAIN,IAAIjB;YAGzB,IAAI,CAACuB,SAAS,QAAQ,EACpB,MAAM,IAAIC,MAAM;YAIlB,MAAMC,OAAOC,OAAO,QAAQ,CAACH,SAAS,IAAI,EAAE;YAC5C,IAAI,CAACA,SAAS,IAAI,IAAIG,OAAO,KAAK,CAACD,OACjC,MAAM,IAAID,MAAM;YAIlB,MAAMG,WAAWJ,SAAS,QAAQ,CAAC,OAAO,CAAC,KAAK;YAChD,MAAMK,YACJD,AAAa,aAAbA,WAAwB,IAAIA,AAAa,aAAbA,WAAwB,IAAI;YAE1DhB,aAAaW,gBAAgB;gBAC3B,MAAMM;gBACN,MAAML,SAAS,QAAQ;gBACvBE;gBACA,GAAIF,SAAS,QAAQ,GACjB;oBACE,QAAQM,mBAAmBN,SAAS,QAAQ;oBAC5C,UAAUM,mBAAmBN,SAAS,QAAQ,IAAI;gBACpD,IACA,CAAC,CAAC;YACR;YACAX,WAAW,uCAAuC;gBAChD,MAAMgB;gBACN,MAAML,SAAS,QAAQ;gBACvB,MAAME;YACR;QACF,EAAE,OAAOK,OAAO;YACdX,QAAQ,KAAK,CAAC,oCAAoCW;YAClD,MAAM,IAAIN,MACR,CAAC,yBAAyB,EAAExB,WAAW,+GAA+G,CAAC;QAE3J;IAEJ;IAEA,MAAM+B,gBAAgB;QACpB,SAAS5B;QACT,QAAQC;QAGR,GAAIO,aAAa;YAAE,cAAc;gBAAE,YAAYA;YAAkB;QAAE,IAAI,CAAC,CAAC;QACzE,GAAGN,iBAAiB;QACpB,GAAI,AAAmB,YAAnB,OAAOK,UAAuB;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAClD,yBAAyB;IAC3B;IAEA,MAAMsB,aAAa,IAAIC,SAAOF;IAE9B,IAAIG,SAAiBF;IAGrB,IACEE,UACAC,oBAAoB,qBAAqB,CAACC,2BAC1C;QACA,IAAIlB,aACF,MAAM,IAAIM,MAAM;QAElBL,QAAQ,GAAG,CAAC;QAEZ,MAAMkB,kBAAkB;QACxB,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAACD;QACpCH,SAASI,WAAWJ;IACtB;IAGA,IACEA,UACAC,oBAAoB,qBAAqB,CAACI,0BAC1C;QACA,IAAIrB,aACF,MAAM,IAAIM,MAAM;QAElBL,QAAQ,GAAG,CAAC;QAEZ,MAAMqB,iBAAiB;QACvB,MAAM,EAAEC,aAAa,EAAE,GAAG,MAAM,MAAM,CAACD;QACvCN,SAASO,cAAcP;IACzB;IAEA,IAAIzB,oBAAoB;QACtB,MAAMiC,gBAAgB,MAAMjC,mBAAmBuB,YAAYD;QAE3D,IAAIW,eACFR,SAASQ;IAEb;IAEA,OAAO;QACL,YAAYR,OAAO,IAAI,CAAC,WAAW;QACnChC;QACAI;QACAC;QACAC;IACF;AACF;AAEO,eAAemC,OACpBC,QAAsC,EACtC7C,WAAyB,EACzB8C,OAIC;IAOD,MAAM,EAAEC,UAAU,EAAE5C,SAAS,EAAEI,gBAAgB,EAAEC,aAAa,EAAEC,MAAM,EAAE,GACtE,MAAMV,iBAAiB;QACrBC;IACF;IAEF,MAAMgD,YACJZ,oBAAoB,yBAAyB,CAACa,8BAC9Cb,oBAAoB,yBAAyB,CAACc;IAChD,MAAMC,YAAYrC,SAAS;IAC3B,MAAMsC,oBAAoBtC,SAAS;IACnC,MAAMuC,qBAAqBvC,SAAS;IAEpC,MAAMwC,YAAYC,KAAK,GAAG;IAC1B,MAAMC,cAAcxD,YAAY,WAAW,IAAI;IAE/C,MAAMyD,cAAcX,SAAS,UAAUA,SAAS;IAChD,IAAIY;IACJ,IAAIC,cAAc;IAClB,IAAIC,uBAAuB;IAC3B,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,iBAAiB,CAACC;QACtB,IAAI,CAACA,WAAW;QAEhB,MAAMC,oBACJD,WACC,uBAAuB;QAE1B,OAAO;YACL,eAAeA,UAAU,aAAa,IAAI;YAC1C,mBAAmBA,UAAU,iBAAiB,IAAI;YAClD,cAAcA,UAAU,YAAY,IAAI;YACxC,cAAcC,qBAAqB;YACnC,WAAWH,YAAY;YACvB,YAAY3D;YACZ,mBAAmBI;YACnB,QAAQP,YAAY,MAAM;QAC5B;IACF;IAEA,MAAMkE,eAAe;QACnBV;QACA,QAAQ,CAAC,CAACC;QACV,YAAYT;QACZ,GAAIvC,AAAW,iBAAXA,SACA;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IACA,MAAM,EACJ,QAAQ0D,eAAe,EACvBC,YAAY,EACZC,cAAc,EACf,GAAGC,uBAAuB;QACzB,WAAWxB,SAAS;QACpBrC;IACF;IACA,IAAI2D,cACFjB,UAAUiB;IAEZ,IAAIC,gBAAgB;QAClBlB,UAAUkB;QACVjD,QAAQ,IAAI,CAACiD;IACf;IAEA,IAAI;QACFlB,UACE,CAAC,QAAQ,EAAEM,cAAc,eAAe,GAAG,WAAW,EAAEtD,WAAW;QAGrE,IAAIsD,aAAa;YACf,MAAMc,SAAU,MAAMxB,WAAW,MAAM,CACrC;gBACE,OAAO5C;gBACP0C;gBACA,GAAGqB,YAAY;gBACf,GAAGC,eAAe;YACpB,GACA;gBACE,QAAQ;YACV;YAKF,WAAW,MAAMK,SAASD,OAAQ;gBAChC,MAAMb,UAAUc,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,OAAO,WAAW;gBACtD,MAAMC,oBACHD,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,OAAe,qBAAqB;gBAG3D,IAAIA,MAAM,KAAK,EACbX,QAAQW,MAAM,KAAK;gBAGrB,IAAId,WAAWe,mBAAmB;oBAChCd,eAAeD;oBACfE,wBAAwBa;oBACxB,MAAMC,YAAiC;wBACrChB;wBACAe;wBACAd;wBACA,YAAY;wBACZ,OAAOgB;oBACT;oBACA7B,QAAQ,OAAO,CAAE4B;gBACnB;gBAGA,IAAIF,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,eAAe;oBACrCV,WAAWP,KAAK,GAAG,KAAKD;oBAGxB,IAAI,CAACO,OAAO;wBAEV,MAAMe,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAClB,YAAY,MAAM,GAAG;wBAElCE,QAAQ;4BACN,eAAee;4BACf,mBAAmBA;4BACnB,cAAcA,AAAkB,IAAlBA;wBAChB;oBACF;oBAGA,MAAME,aAAkC;wBACtC,SAAS;wBACTnB;wBACA,mBAAmB;wBACnB,YAAY;wBACZ,OAAOI,eAAeF;oBACxB;oBACAf,QAAQ,OAAO,CAAEgC;oBACjB;gBACF;YACF;YACApB,UAAUC;YACVP,kBACE,CAAC,iBAAiB,EAAEjD,UAAU,QAAQ,EAAEM,UAAU,UAAU,WAAW,EAAEqD,SAAS,eAAe,EAAEN,eAAe,IAAI;QAE1H,OAAO;YAEL,MAAMuB,aAAa/E,YAAY,UAAU,IAAI;YAC7C,MAAMgF,gBAAgBhF,YAAY,aAAa,IAAI;YACnD,MAAMiF,cAAcF,aAAa;YAEjC,IAAIG;YAEJ,IAAK,IAAIC,UAAU,GAAGA,WAAWF,aAAaE,UAC5C,IAAI;gBACF,MAAMC,SAAS,MAAMrC,WAAW,MAAM,CAAC;oBACrC,OAAO5C;oBACP0C;oBACA,GAAGqB,YAAY;oBACf,GAAGC,eAAe;gBACpB;gBAEAL,WAAWP,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAEjD,UAAU,QAAQ,EAAEM,UAAU,UAAU,mBAAmB,EAAED,cAAc,iBAAiB,EAAE4E,OAAO,KAAK,EAAE,iBAAiB,GAAG,qBAAqB,EAAEA,OAAO,KAAK,EAAE,qBAAqB,GAAG,gBAAgB,EAAEA,OAAO,KAAK,EAAE,gBAAgB,GAAG,WAAW,EAAEtB,SAAS,aAAa,EAAEsB,OAAO,WAAW,IAAI,GAAG,eAAe,EAAE5B,eAAe,IAAI;gBAG9VH,mBACE,CAAC,oBAAoB,EAAEgC,KAAK,SAAS,CAACD,OAAO,KAAK,GAAG;gBAGvD,IAAI,CAACA,OAAO,OAAO,EACjB,MAAM,IAAI3D,MACR,CAAC,mCAAmC,EAAE4D,KAAK,SAAS,CAACD,SAAS;gBAIlE1B,UAAU0B,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3C,IAAI,CAAC1B,SACH,MAAM,IAAIjC,MAAM;gBAGlBmC,uBACGwB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,EAAU,qBAAqB;gBAC3DvB,QAAQuB,OAAO,KAAK;gBACpB;YACF,EAAE,OAAOrD,OAAO;gBACdmD,YAAYnD;gBACZ,IAAIoD,UAAUF,aAAa;oBACzB7D,QAAQ,IAAI,CACV,CAAC,mCAAmC,EAAE+D,QAAQ,CAAC,EAAEF,YAAY,eAAe,EAAED,cAAc,aAAa,EAAEE,UAAU,OAAO,EAAE;oBAEhI,MAAM,IAAII,QAAQ,CAACC,UAAYC,WAAWD,SAASP;gBACrD;YACF;YAGF,IAAI,CAACtB,SACH,MAAMwB;QAEV;QAEA/B,UAAU,CAAC,4BAA4B,EAAES,sBAAsB;QAC/DT,UAAU,CAAC,kBAAkB,EAAEO,SAAS;QAGxC,IAAID,eAAe,CAACI,OAAO;YAEzB,MAAMe,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEnB,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCG,QAAQ;gBACN,eAAee;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASlB,WAAW;YACpB,mBAAmBE,wBAAwBe;YAC3C,OAAOZ,eAAeF;YACtB,YAAY,CAAC,CAACJ;QAChB;IACF,EAAE,OAAOgC,GAAQ;QACfrE,QAAQ,KAAK,CAAC,kBAAkBqE;QAChC,MAAMC,WAAW,IAAIjE,MACnB,CAAC,eAAe,EAAEgC,cAAc,eAAe,GAAG,kBAAkB,EAAEtD,UAAU,GAAG,EAAEsF,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC9J;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,eAAeC,yBACpB9C,QAAsC,EACtC7C,WAAyB,EACzB8C,OAEC;IAOD,MAAM8C,WAAW,MAAMhD,OAAOC,UAAU7C,aAAa;QACnD,WAAW8C,SAAS;IACtB;IACA+C,OAAOD,UAAU;IACjB,MAAMnF,SAAST,YAAY,MAAM;IACjC,MAAM8F,cAAcC,cAAcH,SAAS,OAAO,EAAEnF;IACpDoF,OACE,AAAuB,YAAvB,OAAOC,aACP,CAAC,0CAA0C,EAAE9F,YAAY,SAAS,CAAC,GAAG,EAAE4F,SAAS,OAAO,EAAE;IAE5F,OAAO;QACL,SAASE;QACT,eAAeF,SAAS,OAAO;QAC/B,OAAOA,SAAS,KAAK;QACrB,mBAAmBA,SAAS,iBAAiB;IAC/C;AACF;AAEO,eAAeI,yBACpBC,IAAY,EACZjG,WAAyB;IAEzB,MAAM,EAAE0D,OAAO,EAAEG,KAAK,EAAE,GAAG,MAAMjB,OAAOqD,MAAMjG;IAC9C,OAAO;QAAE0D;QAASG;IAAM;AAC1B;AAEO,SAASqC,yBAAyBN,QAAgB;IACvD,IAAI;QAEF,MAAMO,YAAYP,SAAS,KAAK,CAAC;QACjC,IAAIO,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBR,SAAS,KAAK,CACnC;QAEF,IAAIQ,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBT,SAAS,KAAK,CAAC;QACrC,IAAIS,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOT;AACT;AAEO,SAASU,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASjC,uBAAuB,EACrCkC,SAAS,EACT/F,MAAM,EAIP;IAKC,MAAMgG,sBAAsBD,AAAc,YAAdA,YAAwB7B,SAAY6B;IAEhE,IAAIC,AAAwB9B,WAAxB8B,qBACF,OAAO;QAAE,QAAQ,CAAC;QAAG,cAAc9B;IAAU;IAG/C,IAAIlE,AAAW,eAAXA,QACF,OAAO;QACL,QAAQ;YAAE,iBAAiBgG;QAAoB;QAC/C,cAAc,CAAC,oCAAoC,EAAEA,oBAAoB,aAAa,CAAC;IACzF;IAGF,IAAIhG,AAAW,oBAAXA,QACF,OAAO;QACL,QAAQ;YACN,UAAU;gBAAE,MAAMgG,sBAAsB,YAAY;YAAW;QACjE;QACA,cAAc,CAAC,kCAAkC,EAAEA,sBAAsB,YAAY,WAAW,kBAAkB,CAAC;IACrH;IAGF,IAAIhG,AAAW,YAAXA,QACF,OAAO;QACL,QAAQ;YACN,UAAU;gBAAE,MAAMgG,sBAAsB,YAAY;YAAW;QACjE;QACA,cAAc,CAAC,kCAAkC,EAAEA,sBAAsB,YAAY,WAAW,UAAU,CAAC;IAC7G;IAGF,OAAO;QACL,QAAQ,CAAC;QACT,cAAc,CAAC,6CAA6C,EAAEhG,UAAU,UAAU,CAAC,CAAC;QACpF,gBAAgB,CAAC,0DAA0D,EAAEA,UAAU,UAAU,EAAE,CAAC;IACtG;AACF;AAQA,SAASiG,oBAAoBC,GAAQ;IAEnC,IAAIA,QAAAA,KACF,OAAOA;IAIT,IAAIC,MAAM,OAAO,CAACD,MAChB,OAAOA,IAAI,GAAG,CAAC,CAACE,OAASH,oBAAoBG;IAI/C,IAAI,AAAe,YAAf,OAAOF,KAAkB;QAC3B,MAAMG,aAAkB,CAAC;QAEzB,KAAK,MAAM,CAACC,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACN,KAAM;YAE9C,MAAMO,aAAaH,IAAI,IAAI;YAG3B,IAAII,kBAAkBT,oBAAoBM;YAG1C,IAAI,AAA2B,YAA3B,OAAOG,iBACTA,kBAAkBA,gBAAgB,IAAI;YAGxCL,UAAU,CAACI,WAAW,GAAGC;QAC3B;QAEA,OAAOL;IACT;IAGA,IAAI,AAAe,YAAf,OAAOH,KACT,OAAOA,IAAI,IAAI;IAIjB,OAAOA;AACT;AAEO,SAASZ,cAAcQ,KAAa,EAAE9F,MAAgC;IAC3E,MAAM2G,kBAAkBlB,yBAAyBK;IAEjD,IAAIa,iBAAiB,MAAM,oBACzB,OAAOA,gBACJ,KAAK,CAAC,oBACL,MAAM,GACP,IAAIzF;IAGT,IAAIV;IACJ,IAAIiE;IACJ,IAAI;QACFjE,SAASoE,KAAK,KAAK,CAAC+B;QACpB,OAAOV,oBAAoBzF;IAC7B,EAAE,OAAOc,OAAO;QACdmD,YAAYnD;IACd;IACA,IAAI;QACFd,SAASoE,KAAK,KAAK,CAACgC,WAAWD;QAC/B,OAAOV,oBAAoBzF;IAC7B,EAAE,OAAOc,OAAO;QACdmD,YAAYnD;IACd;IAEA,IAAItB,AAAW,oBAAXA,UAA8BA,AAAW,kBAAXA,QAA0B;QAC1D,MAAM6G,aAAahB,yBAAyBc;QAC5C,IAAI;YACFnG,SAASoE,KAAK,KAAK,CAACgC,WAAWC;YAC/B,OAAOZ,oBAAoBzF;QAC7B,EAAE,OAAOc,OAAO;YACdmD,YAAYnD;QACd;IACF;IACA,MAAMN,MACJ,CAAC,gDAAgD,EAAE8F,OACjDrC,aAAa,iBACb,gBAAgB,EAAEqB,OAAO;AAE/B"}
|
|
1
|
+
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["../../../../src/ai-model/service-caller/index.ts"],"sourcesContent":["import {\n AIResponseFormat,\n type AIUsageInfo,\n type DeepThinkOption,\n} from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport {\n type IModelConfig,\n MIDSCENE_LANGFUSE_DEBUG,\n MIDSCENE_LANGSMITH_DEBUG,\n MIDSCENE_MODEL_MAX_TOKENS,\n OPENAI_MAX_TOKENS,\n type TModelFamily,\n type TVlModeTypes,\n type UITarsModelVersion,\n globalConfigManager,\n} from '@midscene/shared/env';\n\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert, ifInBrowser } from '@midscene/shared/utils';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport type { AIArgs } from '../../common';\n\nasync function createChatClient({\n modelConfig,\n}: {\n modelConfig: IModelConfig;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n modelName: string;\n modelDescription: string;\n uiTarsVersion?: UITarsModelVersion;\n vlMode: TVlModeTypes | undefined;\n modelFamily: TModelFamily | undefined;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n modelDescription,\n uiTarsModelVersion: uiTarsVersion,\n vlMode,\n modelFamily,\n createOpenAIClient,\n timeout,\n } = modelConfig;\n\n let proxyAgent: any = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n\n // Helper function to sanitize proxy URL for logging (remove credentials)\n // Uses URL API instead of regex to avoid ReDoS vulnerabilities\n const sanitizeProxyUrl = (url: string): string => {\n try {\n const parsed = new URL(url);\n if (parsed.username) {\n // Keep username for debugging, hide password for security\n parsed.password = '****';\n return parsed.href;\n }\n return url;\n } catch {\n // If URL parsing fails, return original URL (will be caught later)\n return url;\n }\n };\n\n if (httpProxy) {\n debugProxy('using http proxy', sanitizeProxyUrl(httpProxy));\n if (ifInBrowser) {\n console.warn(\n 'HTTP proxy is configured but not supported in browser environment',\n );\n } else {\n // Dynamic import with variable to avoid bundler static analysis\n const moduleName = 'undici';\n const { ProxyAgent } = await import(moduleName);\n proxyAgent = new ProxyAgent({\n uri: httpProxy,\n // Note: authentication is handled via the URI (e.g., http://user:pass@proxy.com:8080)\n });\n }\n } else if (socksProxy) {\n debugProxy('using socks proxy', sanitizeProxyUrl(socksProxy));\n if (ifInBrowser) {\n console.warn(\n 'SOCKS proxy is configured but not supported in browser environment',\n );\n } else {\n try {\n // Dynamic import with variable to avoid bundler static analysis\n const moduleName = 'fetch-socks';\n const { socksDispatcher } = await import(moduleName);\n // Parse SOCKS proxy URL (e.g., socks5://127.0.0.1:1080)\n const proxyUrl = new URL(socksProxy);\n\n // Validate hostname\n if (!proxyUrl.hostname) {\n throw new Error('SOCKS proxy URL must include a valid hostname');\n }\n\n // Validate and parse port\n const port = Number.parseInt(proxyUrl.port, 10);\n if (!proxyUrl.port || Number.isNaN(port)) {\n throw new Error('SOCKS proxy URL must include a valid port');\n }\n\n // Parse SOCKS version from protocol\n const protocol = proxyUrl.protocol.replace(':', '');\n const socksType =\n protocol === 'socks4' ? 4 : protocol === 'socks5' ? 5 : 5;\n\n proxyAgent = socksDispatcher({\n type: socksType,\n host: proxyUrl.hostname,\n port,\n ...(proxyUrl.username\n ? {\n userId: decodeURIComponent(proxyUrl.username),\n password: decodeURIComponent(proxyUrl.password || ''),\n }\n : {}),\n });\n debugProxy('socks proxy configured successfully', {\n type: socksType,\n host: proxyUrl.hostname,\n port: port,\n });\n } catch (error) {\n console.error('Failed to configure SOCKS proxy:', error);\n throw new Error(\n `Invalid SOCKS proxy URL: ${socksProxy}. Expected format: socks4://host:port, socks5://host:port, or with authentication: socks5://user:pass@host:port`,\n );\n }\n }\n }\n\n const openAIOptions = {\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n // Use fetchOptions.dispatcher for fetch-based SDK instead of httpAgent\n // Note: Type assertion needed due to undici version mismatch between dependencies\n ...(proxyAgent ? { fetchOptions: { dispatcher: proxyAgent as any } } : {}),\n ...openaiExtraConfig,\n ...(typeof timeout === 'number' ? { timeout } : {}),\n dangerouslyAllowBrowser: true,\n };\n\n const baseOpenAI = new OpenAI(openAIOptions);\n\n let openai: OpenAI = baseOpenAI;\n\n // LangSmith wrapper\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n // Use variable to prevent static analysis by bundlers\n const langsmithModule = 'langsmith/wrappers';\n const { wrapOpenAI } = await import(langsmithModule);\n openai = wrapOpenAI(openai);\n }\n\n // Langfuse wrapper\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGFUSE_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langfuse is not supported in browser');\n }\n console.log('DEBUGGING MODE: langfuse wrapper enabled');\n // Use variable to prevent static analysis by bundlers\n const langfuseModule = 'langfuse';\n const { observeOpenAI } = await import(langfuseModule);\n openai = observeOpenAI(openai);\n }\n\n if (createOpenAIClient) {\n const wrappedClient = await createOpenAIClient(baseOpenAI, openAIOptions);\n\n if (wrappedClient) {\n openai = wrappedClient as OpenAI;\n }\n }\n\n return {\n completion: openai.chat.completions,\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n modelFamily,\n };\n}\n\nexport async function callAI(\n messages: ChatCompletionMessageParam[],\n modelConfig: IModelConfig,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n deepThink?: DeepThinkOption;\n },\n): Promise<{\n content: string;\n reasoning_content?: string;\n usage?: AIUsageInfo;\n isStreamed: boolean;\n}> {\n const {\n completion,\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n modelFamily,\n } = await createChatClient({\n modelConfig,\n });\n\n const maxTokens =\n globalConfigManager.getEnvConfigValueAsNumber(MIDSCENE_MODEL_MAX_TOKENS) ??\n globalConfigManager.getEnvConfigValueAsNumber(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n const temperature = modelConfig.temperature ?? 0;\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let accumulatedReasoning = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const buildUsageInfo = (usageData?: OpenAI.CompletionUsage) => {\n if (!usageData) return undefined;\n\n const cachedInputTokens = (\n usageData as { prompt_tokens_details?: { cached_tokens?: number } }\n )?.prompt_tokens_details?.cached_tokens;\n\n return {\n prompt_tokens: usageData.prompt_tokens ?? 0,\n completion_tokens: usageData.completion_tokens ?? 0,\n total_tokens: usageData.total_tokens ?? 0,\n cached_input: cachedInputTokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelConfig.intent,\n } satisfies AIUsageInfo;\n };\n\n const commonConfig = {\n temperature,\n stream: !!isStreaming,\n max_tokens: maxTokens,\n ...(vlMode === 'qwen2.5-vl' // qwen vl v2 specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n const {\n config: deepThinkConfig,\n debugMessage,\n warningMessage,\n } = resolveDeepThinkConfig({\n deepThink: options?.deepThink,\n modelFamily,\n });\n if (debugMessage) {\n debugCall(debugMessage);\n }\n if (warningMessage) {\n debugCall(warningMessage);\n console.warn(warningMessage);\n }\n\n try {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n ...commonConfig,\n ...deepThinkConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n accumulatedReasoning += reasoning_content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: buildUsageInfo(usage),\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlMode || 'default'}, cost-ms, ${timeCost}, temperature, ${temperature ?? ''}`,\n );\n } else {\n // Non-streaming with retry logic\n const retryCount = modelConfig.retryCount ?? 1;\n const retryInterval = modelConfig.retryInterval ?? 2000;\n const maxAttempts = retryCount + 1; // retryCount=1 means 2 total attempts (1 initial + 1 retry)\n\n let lastError: Error | undefined;\n\n for (let attempt = 1; attempt <= maxAttempts; attempt++) {\n try {\n const result = await completion.create({\n model: modelName,\n messages,\n ...commonConfig,\n ...deepThinkConfig,\n } as any);\n\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlMode || 'default'}, ui-tars-version, ${uiTarsVersion}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}, temperature, ${temperature ?? ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n if (!result.choices) {\n throw new Error(\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n }\n\n content = result.choices[0].message.content!;\n if (!content) {\n throw new Error('empty content from AI model');\n }\n\n accumulatedReasoning =\n (result.choices[0].message as any)?.reasoning_content || '';\n usage = result.usage;\n break; // Success, exit retry loop\n } catch (error) {\n lastError = error as Error;\n if (attempt < maxAttempts) {\n console.warn(\n `[Midscene] AI call failed (attempt ${attempt}/${maxAttempts}), retrying in ${retryInterval}ms... Error: ${lastError.message}`,\n );\n await new Promise((resolve) => setTimeout(resolve, retryInterval));\n }\n }\n }\n\n if (!content) {\n throw lastError;\n }\n }\n\n debugCall(`response reasoning content: ${accumulatedReasoning}`);\n debugCall(`response content: ${content}`);\n\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n } as OpenAI.CompletionUsage;\n }\n\n return {\n content: content || '',\n reasoning_content: accumulatedReasoning || undefined,\n usage: buildUsageInfo(usage),\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service (${modelName}): ${e.message}\\nTrouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport async function callAIWithObjectResponse<T>(\n messages: ChatCompletionMessageParam[],\n modelConfig: IModelConfig,\n options?: {\n deepThink?: DeepThinkOption;\n },\n): Promise<{\n content: T;\n contentString: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const response = await callAI(messages, modelConfig, {\n deepThink: options?.deepThink,\n });\n assert(response, 'empty response');\n const vlMode = modelConfig.vlMode;\n const jsonContent = safeParseJson(response.content, vlMode);\n assert(\n typeof jsonContent === 'object',\n `failed to parse json response from model (${modelConfig.modelName}): ${response.content}`,\n );\n return {\n content: jsonContent,\n contentString: response.content,\n usage: response.usage,\n reasoning_content: response.reasoning_content,\n };\n}\n\nexport async function callAIWithStringResponse(\n msgs: AIArgs,\n modelConfig: IModelConfig,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await callAI(msgs, modelConfig);\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function resolveDeepThinkConfig({\n deepThink,\n modelFamily,\n}: {\n deepThink?: DeepThinkOption;\n modelFamily?: TModelFamily;\n}): {\n config: Record<string, unknown>;\n debugMessage?: string;\n warningMessage?: string;\n} {\n const normalizedDeepThink = deepThink === 'unset' ? undefined : deepThink;\n\n if (normalizedDeepThink === undefined) {\n return { config: {}, debugMessage: undefined };\n }\n\n if (modelFamily === 'qwen3-vl') {\n return {\n config: { enable_thinking: normalizedDeepThink },\n debugMessage: `deepThink mapped to enable_thinking=${normalizedDeepThink} for qwen3-vl`,\n };\n }\n\n if (modelFamily === 'doubao-vision') {\n return {\n config: {\n thinking: { type: normalizedDeepThink ? 'enabled' : 'disabled' },\n },\n debugMessage: `deepThink mapped to thinking.type=${normalizedDeepThink ? 'enabled' : 'disabled'} for doubao-vision`,\n };\n }\n\n if (modelFamily === 'glm-v') {\n return {\n config: {\n thinking: { type: normalizedDeepThink ? 'enabled' : 'disabled' },\n },\n debugMessage: `deepThink mapped to thinking.type=${normalizedDeepThink ? 'enabled' : 'disabled'} for glm-v`,\n };\n }\n\n return {\n config: {},\n debugMessage: `deepThink ignored: unsupported model_family \"${modelFamily ?? 'default'}\"`,\n warningMessage: `The \"deepThink\" option is not supported for model_family \"${modelFamily ?? 'default'}\".`,\n };\n}\n\n/**\n * Normalize a parsed JSON object by trimming whitespace from:\n * 1. All object keys (e.g., \" prompt \" -> \"prompt\")\n * 2. All string values (e.g., \" Tap \" -> \"Tap\")\n * This handles LLM output that may include leading/trailing spaces.\n */\nfunction normalizeJsonObject(obj: any): any {\n // Handle null and undefined\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n // Handle arrays - recursively normalize each element\n if (Array.isArray(obj)) {\n return obj.map((item) => normalizeJsonObject(item));\n }\n\n // Handle objects\n if (typeof obj === 'object') {\n const normalized: any = {};\n\n for (const [key, value] of Object.entries(obj)) {\n // Trim the key to remove leading/trailing spaces\n const trimmedKey = key.trim();\n\n // Recursively normalize the value\n let normalizedValue = normalizeJsonObject(value);\n\n // Trim all string values\n if (typeof normalizedValue === 'string') {\n normalizedValue = normalizedValue.trim();\n }\n\n normalized[trimmedKey] = normalizedValue;\n }\n\n return normalized;\n }\n\n // Handle primitive strings\n if (typeof obj === 'string') {\n return obj.trim();\n }\n\n // Return other primitives as-is\n return obj;\n}\n\nexport function safeParseJson(input: string, vlMode: TVlModeTypes | undefined) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n\n let parsed: any;\n let lastError: unknown;\n try {\n parsed = JSON.parse(cleanJsonString);\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n try {\n parsed = JSON.parse(jsonrepair(cleanJsonString));\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n\n if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n try {\n parsed = JSON.parse(jsonrepair(jsonString));\n return normalizeJsonObject(parsed);\n } catch (error) {\n lastError = error;\n }\n }\n throw Error(\n `failed to parse LLM response into JSON. Error - ${String(\n lastError ?? 'unknown error',\n )}. Response - \\n ${input}`,\n );\n}\n"],"names":["createChatClient","modelConfig","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","modelDescription","uiTarsVersion","vlMode","modelFamily","createOpenAIClient","timeout","proxyAgent","debugProxy","getDebug","sanitizeProxyUrl","url","parsed","URL","ifInBrowser","console","moduleName","ProxyAgent","socksDispatcher","proxyUrl","Error","port","Number","protocol","socksType","decodeURIComponent","error","openAIOptions","baseOpenAI","OpenAI","openai","globalConfigManager","MIDSCENE_LANGSMITH_DEBUG","langsmithModule","wrapOpenAI","MIDSCENE_LANGFUSE_DEBUG","langfuseModule","observeOpenAI","wrappedClient","callAI","messages","options","completion","maxTokens","MIDSCENE_MODEL_MAX_TOKENS","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","temperature","isStreaming","content","accumulated","accumulatedReasoning","usage","timeCost","buildUsageInfo","usageData","cachedInputTokens","commonConfig","deepThinkConfig","debugMessage","warningMessage","resolveDeepThinkConfig","stream","chunk","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","retryCount","retryInterval","maxAttempts","lastError","attempt","result","JSON","Promise","resolve","setTimeout","e","newError","callAIWithObjectResponse","response","assert","jsonContent","safeParseJson","callAIWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","deepThink","normalizedDeepThink","normalizeJsonObject","obj","Array","item","normalized","key","value","Object","trimmedKey","normalizedValue","cleanJsonString","jsonrepair","jsonString","String"],"mappings":";;;;;AA0BA,eAAeA,iBAAiB,EAC9BC,WAAW,EAGZ;IAQC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,gBAAgB,EAChB,oBAAoBC,aAAa,EACjCC,MAAM,EACNC,WAAW,EACXC,kBAAkB,EAClBC,OAAO,EACR,GAAGZ;IAEJ,IAAIa;IACJ,MAAMC,aAAaC,SAAS;IAI5B,MAAMC,mBAAmB,CAACC;QACxB,IAAI;YACF,MAAMC,SAAS,IAAIC,IAAIF;YACvB,IAAIC,OAAO,QAAQ,EAAE;gBAEnBA,OAAO,QAAQ,GAAG;gBAClB,OAAOA,OAAO,IAAI;YACpB;YACA,OAAOD;QACT,EAAE,OAAM;YAEN,OAAOA;QACT;IACF;IAEA,IAAIf,WAAW;QACbY,WAAW,oBAAoBE,iBAAiBd;QAChD,IAAIkB,aACFC,QAAQ,IAAI,CACV;aAEG;YAEL,MAAMC,aAAa;YACnB,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAACD;YACpCT,aAAa,IAAIU,WAAW;gBAC1B,KAAKrB;YAEP;QACF;IACF,OAAO,IAAID,YAAY;QACrBa,WAAW,qBAAqBE,iBAAiBf;QACjD,IAAImB,aACFC,QAAQ,IAAI,CACV;aAGF,IAAI;YAEF,MAAMC,aAAa;YACnB,MAAM,EAAEE,eAAe,EAAE,GAAG,MAAM,MAAM,CAACF;YAEzC,MAAMG,WAAW,IAAIN,IAAIlB;YAGzB,IAAI,CAACwB,SAAS,QAAQ,EACpB,MAAM,IAAIC,MAAM;YAIlB,MAAMC,OAAOC,OAAO,QAAQ,CAACH,SAAS,IAAI,EAAE;YAC5C,IAAI,CAACA,SAAS,IAAI,IAAIG,OAAO,KAAK,CAACD,OACjC,MAAM,IAAID,MAAM;YAIlB,MAAMG,WAAWJ,SAAS,QAAQ,CAAC,OAAO,CAAC,KAAK;YAChD,MAAMK,YACJD,AAAa,aAAbA,WAAwB,IAAIA,AAAa,aAAbA,WAAwB,IAAI;YAE1DhB,aAAaW,gBAAgB;gBAC3B,MAAMM;gBACN,MAAML,SAAS,QAAQ;gBACvBE;gBACA,GAAIF,SAAS,QAAQ,GACjB;oBACE,QAAQM,mBAAmBN,SAAS,QAAQ;oBAC5C,UAAUM,mBAAmBN,SAAS,QAAQ,IAAI;gBACpD,IACA,CAAC,CAAC;YACR;YACAX,WAAW,uCAAuC;gBAChD,MAAMgB;gBACN,MAAML,SAAS,QAAQ;gBACvB,MAAME;YACR;QACF,EAAE,OAAOK,OAAO;YACdX,QAAQ,KAAK,CAAC,oCAAoCW;YAClD,MAAM,IAAIN,MACR,CAAC,yBAAyB,EAAEzB,WAAW,+GAA+G,CAAC;QAE3J;IAEJ;IAEA,MAAMgC,gBAAgB;QACpB,SAAS7B;QACT,QAAQC;QAGR,GAAIQ,aAAa;YAAE,cAAc;gBAAE,YAAYA;YAAkB;QAAE,IAAI,CAAC,CAAC;QACzE,GAAGP,iBAAiB;QACpB,GAAI,AAAmB,YAAnB,OAAOM,UAAuB;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAClD,yBAAyB;IAC3B;IAEA,MAAMsB,aAAa,IAAIC,SAAOF;IAE9B,IAAIG,SAAiBF;IAGrB,IACEE,UACAC,oBAAoB,qBAAqB,CAACC,2BAC1C;QACA,IAAIlB,aACF,MAAM,IAAIM,MAAM;QAElBL,QAAQ,GAAG,CAAC;QAEZ,MAAMkB,kBAAkB;QACxB,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAACD;QACpCH,SAASI,WAAWJ;IACtB;IAGA,IACEA,UACAC,oBAAoB,qBAAqB,CAACI,0BAC1C;QACA,IAAIrB,aACF,MAAM,IAAIM,MAAM;QAElBL,QAAQ,GAAG,CAAC;QAEZ,MAAMqB,iBAAiB;QACvB,MAAM,EAAEC,aAAa,EAAE,GAAG,MAAM,MAAM,CAACD;QACvCN,SAASO,cAAcP;IACzB;IAEA,IAAIzB,oBAAoB;QACtB,MAAMiC,gBAAgB,MAAMjC,mBAAmBuB,YAAYD;QAE3D,IAAIW,eACFR,SAASQ;IAEb;IAEA,OAAO;QACL,YAAYR,OAAO,IAAI,CAAC,WAAW;QACnCjC;QACAI;QACAC;QACAC;QACAC;IACF;AACF;AAEO,eAAemC,OACpBC,QAAsC,EACtC9C,WAAyB,EACzB+C,OAIC;IAOD,MAAM,EACJC,UAAU,EACV7C,SAAS,EACTI,gBAAgB,EAChBC,aAAa,EACbC,MAAM,EACNC,WAAW,EACZ,GAAG,MAAMX,iBAAiB;QACzBC;IACF;IAEA,MAAMiD,YACJZ,oBAAoB,yBAAyB,CAACa,8BAC9Cb,oBAAoB,yBAAyB,CAACc;IAChD,MAAMC,YAAYrC,SAAS;IAC3B,MAAMsC,oBAAoBtC,SAAS;IACnC,MAAMuC,qBAAqBvC,SAAS;IAEpC,MAAMwC,YAAYC,KAAK,GAAG;IAC1B,MAAMC,cAAczD,YAAY,WAAW,IAAI;IAE/C,MAAM0D,cAAcX,SAAS,UAAUA,SAAS;IAChD,IAAIY;IACJ,IAAIC,cAAc;IAClB,IAAIC,uBAAuB;IAC3B,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,iBAAiB,CAACC;QACtB,IAAI,CAACA,WAAW;QAEhB,MAAMC,oBACJD,WACC,uBAAuB;QAE1B,OAAO;YACL,eAAeA,UAAU,aAAa,IAAI;YAC1C,mBAAmBA,UAAU,iBAAiB,IAAI;YAClD,cAAcA,UAAU,YAAY,IAAI;YACxC,cAAcC,qBAAqB;YACnC,WAAWH,YAAY;YACvB,YAAY5D;YACZ,mBAAmBI;YACnB,QAAQP,YAAY,MAAM;QAC5B;IACF;IAEA,MAAMmE,eAAe;QACnBV;QACA,QAAQ,CAAC,CAACC;QACV,YAAYT;QACZ,GAAIxC,AAAW,iBAAXA,SACA;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IACA,MAAM,EACJ,QAAQ2D,eAAe,EACvBC,YAAY,EACZC,cAAc,EACf,GAAGC,uBAAuB;QACzB,WAAWxB,SAAS;QACpBrC;IACF;IACA,IAAI2D,cACFjB,UAAUiB;IAEZ,IAAIC,gBAAgB;QAClBlB,UAAUkB;QACVjD,QAAQ,IAAI,CAACiD;IACf;IAEA,IAAI;QACFlB,UACE,CAAC,QAAQ,EAAEM,cAAc,eAAe,GAAG,WAAW,EAAEvD,WAAW;QAGrE,IAAIuD,aAAa;YACf,MAAMc,SAAU,MAAMxB,WAAW,MAAM,CACrC;gBACE,OAAO7C;gBACP2C;gBACA,GAAGqB,YAAY;gBACf,GAAGC,eAAe;YACpB,GACA;gBACE,QAAQ;YACV;YAKF,WAAW,MAAMK,SAASD,OAAQ;gBAChC,MAAMb,UAAUc,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,OAAO,WAAW;gBACtD,MAAMC,oBACHD,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,OAAe,qBAAqB;gBAG3D,IAAIA,MAAM,KAAK,EACbX,QAAQW,MAAM,KAAK;gBAGrB,IAAId,WAAWe,mBAAmB;oBAChCd,eAAeD;oBACfE,wBAAwBa;oBACxB,MAAMC,YAAiC;wBACrChB;wBACAe;wBACAd;wBACA,YAAY;wBACZ,OAAOgB;oBACT;oBACA7B,QAAQ,OAAO,CAAE4B;gBACnB;gBAGA,IAAIF,MAAM,OAAO,EAAE,CAAC,EAAE,EAAE,eAAe;oBACrCV,WAAWP,KAAK,GAAG,KAAKD;oBAGxB,IAAI,CAACO,OAAO;wBAEV,MAAMe,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAClB,YAAY,MAAM,GAAG;wBAElCE,QAAQ;4BACN,eAAee;4BACf,mBAAmBA;4BACnB,cAAcA,AAAkB,IAAlBA;wBAChB;oBACF;oBAGA,MAAME,aAAkC;wBACtC,SAAS;wBACTnB;wBACA,mBAAmB;wBACnB,YAAY;wBACZ,OAAOI,eAAeF;oBACxB;oBACAf,QAAQ,OAAO,CAAEgC;oBACjB;gBACF;YACF;YACApB,UAAUC;YACVP,kBACE,CAAC,iBAAiB,EAAElD,UAAU,QAAQ,EAAEM,UAAU,UAAU,WAAW,EAAEsD,SAAS,eAAe,EAAEN,eAAe,IAAI;QAE1H,OAAO;YAEL,MAAMuB,aAAahF,YAAY,UAAU,IAAI;YAC7C,MAAMiF,gBAAgBjF,YAAY,aAAa,IAAI;YACnD,MAAMkF,cAAcF,aAAa;YAEjC,IAAIG;YAEJ,IAAK,IAAIC,UAAU,GAAGA,WAAWF,aAAaE,UAC5C,IAAI;gBACF,MAAMC,SAAS,MAAMrC,WAAW,MAAM,CAAC;oBACrC,OAAO7C;oBACP2C;oBACA,GAAGqB,YAAY;oBACf,GAAGC,eAAe;gBACpB;gBAEAL,WAAWP,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAElD,UAAU,QAAQ,EAAEM,UAAU,UAAU,mBAAmB,EAAED,cAAc,iBAAiB,EAAE6E,OAAO,KAAK,EAAE,iBAAiB,GAAG,qBAAqB,EAAEA,OAAO,KAAK,EAAE,qBAAqB,GAAG,gBAAgB,EAAEA,OAAO,KAAK,EAAE,gBAAgB,GAAG,WAAW,EAAEtB,SAAS,aAAa,EAAEsB,OAAO,WAAW,IAAI,GAAG,eAAe,EAAE5B,eAAe,IAAI;gBAG9VH,mBACE,CAAC,oBAAoB,EAAEgC,KAAK,SAAS,CAACD,OAAO,KAAK,GAAG;gBAGvD,IAAI,CAACA,OAAO,OAAO,EACjB,MAAM,IAAI3D,MACR,CAAC,mCAAmC,EAAE4D,KAAK,SAAS,CAACD,SAAS;gBAIlE1B,UAAU0B,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3C,IAAI,CAAC1B,SACH,MAAM,IAAIjC,MAAM;gBAGlBmC,uBACGwB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,EAAU,qBAAqB;gBAC3DvB,QAAQuB,OAAO,KAAK;gBACpB;YACF,EAAE,OAAOrD,OAAO;gBACdmD,YAAYnD;gBACZ,IAAIoD,UAAUF,aAAa;oBACzB7D,QAAQ,IAAI,CACV,CAAC,mCAAmC,EAAE+D,QAAQ,CAAC,EAAEF,YAAY,eAAe,EAAED,cAAc,aAAa,EAAEE,UAAU,OAAO,EAAE;oBAEhI,MAAM,IAAII,QAAQ,CAACC,UAAYC,WAAWD,SAASP;gBACrD;YACF;YAGF,IAAI,CAACtB,SACH,MAAMwB;QAEV;QAEA/B,UAAU,CAAC,4BAA4B,EAAES,sBAAsB;QAC/DT,UAAU,CAAC,kBAAkB,EAAEO,SAAS;QAGxC,IAAID,eAAe,CAACI,OAAO;YAEzB,MAAMe,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEnB,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCG,QAAQ;gBACN,eAAee;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASlB,WAAW;YACpB,mBAAmBE,wBAAwBe;YAC3C,OAAOZ,eAAeF;YACtB,YAAY,CAAC,CAACJ;QAChB;IACF,EAAE,OAAOgC,GAAQ;QACfrE,QAAQ,KAAK,CAAC,kBAAkBqE;QAChC,MAAMC,WAAW,IAAIjE,MACnB,CAAC,eAAe,EAAEgC,cAAc,eAAe,GAAG,kBAAkB,EAAEvD,UAAU,GAAG,EAAEuF,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC9J;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,eAAeC,yBACpB9C,QAAsC,EACtC9C,WAAyB,EACzB+C,OAEC;IAOD,MAAM8C,WAAW,MAAMhD,OAAOC,UAAU9C,aAAa;QACnD,WAAW+C,SAAS;IACtB;IACA+C,OAAOD,UAAU;IACjB,MAAMpF,SAAST,YAAY,MAAM;IACjC,MAAM+F,cAAcC,cAAcH,SAAS,OAAO,EAAEpF;IACpDqF,OACE,AAAuB,YAAvB,OAAOC,aACP,CAAC,0CAA0C,EAAE/F,YAAY,SAAS,CAAC,GAAG,EAAE6F,SAAS,OAAO,EAAE;IAE5F,OAAO;QACL,SAASE;QACT,eAAeF,SAAS,OAAO;QAC/B,OAAOA,SAAS,KAAK;QACrB,mBAAmBA,SAAS,iBAAiB;IAC/C;AACF;AAEO,eAAeI,yBACpBC,IAAY,EACZlG,WAAyB;IAEzB,MAAM,EAAE2D,OAAO,EAAEG,KAAK,EAAE,GAAG,MAAMjB,OAAOqD,MAAMlG;IAC9C,OAAO;QAAE2D;QAASG;IAAM;AAC1B;AAEO,SAASqC,yBAAyBN,QAAgB;IACvD,IAAI;QAEF,MAAMO,YAAYP,SAAS,KAAK,CAAC;QACjC,IAAIO,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBR,SAAS,KAAK,CACnC;QAEF,IAAIQ,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBT,SAAS,KAAK,CAAC;QACrC,IAAIS,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOT;AACT;AAEO,SAASU,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASjC,uBAAuB,EACrCkC,SAAS,EACT/F,WAAW,EAIZ;IAKC,MAAMgG,sBAAsBD,AAAc,YAAdA,YAAwB7B,SAAY6B;IAEhE,IAAIC,AAAwB9B,WAAxB8B,qBACF,OAAO;QAAE,QAAQ,CAAC;QAAG,cAAc9B;IAAU;IAG/C,IAAIlE,AAAgB,eAAhBA,aACF,OAAO;QACL,QAAQ;YAAE,iBAAiBgG;QAAoB;QAC/C,cAAc,CAAC,oCAAoC,EAAEA,oBAAoB,aAAa,CAAC;IACzF;IAGF,IAAIhG,AAAgB,oBAAhBA,aACF,OAAO;QACL,QAAQ;YACN,UAAU;gBAAE,MAAMgG,sBAAsB,YAAY;YAAW;QACjE;QACA,cAAc,CAAC,kCAAkC,EAAEA,sBAAsB,YAAY,WAAW,kBAAkB,CAAC;IACrH;IAGF,IAAIhG,AAAgB,YAAhBA,aACF,OAAO;QACL,QAAQ;YACN,UAAU;gBAAE,MAAMgG,sBAAsB,YAAY;YAAW;QACjE;QACA,cAAc,CAAC,kCAAkC,EAAEA,sBAAsB,YAAY,WAAW,UAAU,CAAC;IAC7G;IAGF,OAAO;QACL,QAAQ,CAAC;QACT,cAAc,CAAC,6CAA6C,EAAEhG,eAAe,UAAU,CAAC,CAAC;QACzF,gBAAgB,CAAC,0DAA0D,EAAEA,eAAe,UAAU,EAAE,CAAC;IAC3G;AACF;AAQA,SAASiG,oBAAoBC,GAAQ;IAEnC,IAAIA,QAAAA,KACF,OAAOA;IAIT,IAAIC,MAAM,OAAO,CAACD,MAChB,OAAOA,IAAI,GAAG,CAAC,CAACE,OAASH,oBAAoBG;IAI/C,IAAI,AAAe,YAAf,OAAOF,KAAkB;QAC3B,MAAMG,aAAkB,CAAC;QAEzB,KAAK,MAAM,CAACC,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACN,KAAM;YAE9C,MAAMO,aAAaH,IAAI,IAAI;YAG3B,IAAII,kBAAkBT,oBAAoBM;YAG1C,IAAI,AAA2B,YAA3B,OAAOG,iBACTA,kBAAkBA,gBAAgB,IAAI;YAGxCL,UAAU,CAACI,WAAW,GAAGC;QAC3B;QAEA,OAAOL;IACT;IAGA,IAAI,AAAe,YAAf,OAAOH,KACT,OAAOA,IAAI,IAAI;IAIjB,OAAOA;AACT;AAEO,SAASZ,cAAcQ,KAAa,EAAE/F,MAAgC;IAC3E,MAAM4G,kBAAkBlB,yBAAyBK;IAEjD,IAAIa,iBAAiB,MAAM,oBACzB,OAAOA,gBACJ,KAAK,CAAC,oBACL,MAAM,GACP,IAAIzF;IAGT,IAAIV;IACJ,IAAIiE;IACJ,IAAI;QACFjE,SAASoE,KAAK,KAAK,CAAC+B;QACpB,OAAOV,oBAAoBzF;IAC7B,EAAE,OAAOc,OAAO;QACdmD,YAAYnD;IACd;IACA,IAAI;QACFd,SAASoE,KAAK,KAAK,CAACgC,WAAWD;QAC/B,OAAOV,oBAAoBzF;IAC7B,EAAE,OAAOc,OAAO;QACdmD,YAAYnD;IACd;IAEA,IAAIvB,AAAW,oBAAXA,UAA8BA,AAAW,kBAAXA,QAA0B;QAC1D,MAAM8G,aAAahB,yBAAyBc;QAC5C,IAAI;YACFnG,SAASoE,KAAK,KAAK,CAACgC,WAAWC;YAC/B,OAAOZ,oBAAoBzF;QAC7B,EAAE,OAAOc,OAAO;YACdmD,YAAYnD;QACd;IACF;IACA,MAAMN,MACJ,CAAC,gDAAgD,EAAE8F,OACjDrC,aAAa,iBACb,gBAAgB,EAAEqB,OAAO;AAE/B"}
|
|
@@ -20,7 +20,8 @@ async function uiTarsPlanning(userInstruction, options) {
|
|
|
20
20
|
let instruction = userInstruction;
|
|
21
21
|
if (actionContext) instruction = `<high_priority_knowledge>${actionContext}</high_priority_knowledge>\n<user_instruction>${userInstruction}</user_instruction>`;
|
|
22
22
|
const systemPrompt = getUiTarsPlanningPrompt() + instruction;
|
|
23
|
-
const
|
|
23
|
+
const screenshotBase64 = context.screenshot.getData();
|
|
24
|
+
const imagePayload = await resizeImageForUiTars(screenshotBase64, context.size, uiTarsModelVersion);
|
|
24
25
|
conversationHistory.append({
|
|
25
26
|
role: 'user',
|
|
26
27
|
content: [
|
|
@@ -213,7 +214,7 @@ async function uiTarsPlanning(userInstruction, options) {
|
|
|
213
214
|
log,
|
|
214
215
|
usage: res.usage,
|
|
215
216
|
rawResponse: JSON.stringify(res.content, void 0, 2),
|
|
216
|
-
|
|
217
|
+
shouldContinuePlanning: shouldContinue
|
|
217
218
|
};
|
|
218
219
|
}
|
|
219
220
|
function convertBboxToCoordinates(text) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["../../../src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n PlanningAIResponse,\n PlanningAction,\n Size,\n UIContext,\n} from '@/types';\nimport { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ConversationHistory } from './conversation-history';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { callAIWithStringResponse } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'left_double'\n | 'right_single'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function uiTarsPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig, actionContext } = options;\n const { uiTarsModelVersion } = modelConfig;\n\n let instruction = userInstruction;\n if (actionContext) {\n instruction = `<high_priority_knowledge>${actionContext}</high_priority_knowledge>\\n<user_instruction>${userInstruction}</user_instruction>`;\n }\n\n const systemPrompt = getUiTarsPlanningPrompt() + instruction;\n\n const imagePayload = await resizeImageForUiTars(\n context.screenshotBase64,\n context.size,\n uiTarsModelVersion,\n );\n\n conversationHistory.append({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n },\n },\n ],\n });\n\n const res = await callAIWithStringResponse(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory.snapshot(),\n ],\n modelConfig,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const { size } = context;\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: uiTarsModelVersion,\n });\n\n debug(\n 'ui-tars modelVer',\n uiTarsModelVersion,\n ', parsed',\n JSON.stringify(parsed),\n );\n\n const transformActions: PlanningAction[] = [];\n const unhandledActions: Array<{ type: string; thought: string }> = [];\n let shouldContinue = true;\n parsed.forEach((action) => {\n const actionType = (action.action_type || '').toLowerCase();\n if (actionType === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (actionType === 'left_double') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'DoubleClick',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'right_single') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'RightClick',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'finished') {\n shouldContinue = false;\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (actionType === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys.join('+'),\n },\n thought: action.thought || '',\n });\n }\n } else if (actionType === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n } else if (actionType) {\n // Track unhandled action types\n unhandledActions.push({\n type: actionType,\n thought: action.thought || '',\n });\n debug('Unhandled action type:', actionType, 'thought:', action.thought);\n }\n });\n\n if (transformActions.length === 0) {\n const errorDetails: string[] = [];\n\n // Check if parsing failed\n if (parsed.length === 0) {\n errorDetails.push('Action parser returned no actions');\n\n // Check if response has Thought but no Action\n if (\n res.content.includes('Thought:') &&\n !res.content.includes('Action:')\n ) {\n errorDetails.push(\n 'Response contains \"Thought:\" but missing \"Action:\" line',\n );\n } else {\n errorDetails.push('Response may be malformed or empty');\n }\n }\n\n // Check if we have unhandled action types\n if (unhandledActions.length > 0) {\n const types = unhandledActions.map((a) => a.type).join(', ');\n errorDetails.push(`Unhandled action types: ${types}`);\n }\n\n const errorMessage = [\n 'No actions found in UI-TARS response.',\n ...errorDetails,\n `\\nRaw response: ${res.content}`,\n ].join('\\n');\n\n throw new Error(errorMessage, {\n cause: {\n prediction: res.content,\n parsed,\n unhandledActions,\n convertedText,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n const log = getSummary(res.content);\n\n conversationHistory.append({\n role: 'assistant',\n content: log,\n });\n\n return {\n actions: transformActions,\n log,\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n more_actions_needed_by_instruction: shouldContinue,\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface LeftDoubleAction extends BaseAction {\n action_type: 'left_double';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface RightSingleAction extends BaseAction {\n action_type: 'right_single';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | LeftDoubleAction\n | RightSingleAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n uiTarsVersion: UITarsModelVersion | undefined,\n) {\n if (uiTarsVersion === UITarsModelVersion.V1_5) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","uiTarsPlanning","userInstruction","options","conversationHistory","context","modelConfig","actionContext","uiTarsModelVersion","instruction","systemPrompt","getUiTarsPlanningPrompt","imagePayload","resizeImageForUiTars","res","callAIWithStringResponse","convertedText","convertBboxToCoordinates","size","parsed","actionParser","JSON","transformActions","unhandledActions","shouldContinue","action","actionType","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","errorDetails","types","a","errorMessage","Error","log","getSummary","undefined","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","imageBase64","uiTarsVersion","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;AA0BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,eACpBC,eAAuB,EACvBC,OAKC;IAED,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,WAAW,EAAEC,aAAa,EAAE,GAAGJ;IACrE,MAAM,EAAEK,kBAAkB,EAAE,GAAGF;IAE/B,IAAIG,cAAcP;IAClB,IAAIK,eACFE,cAAc,CAAC,yBAAyB,EAAEF,cAAc,8CAA8C,EAAEL,gBAAgB,mBAAmB,CAAC;IAG9I,MAAMQ,eAAeC,4BAA4BF;IAEjD,MAAMG,eAAe,MAAMC,qBACzBR,QAAQ,gBAAgB,EACxBA,QAAQ,IAAI,EACZG;IAGFJ,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKQ;gBACP;YACF;SACD;IACH;IAEA,MAAME,MAAM,MAAMC,yBAChB;QACE;YACE,MAAM;YACN,SAASL;QACX;WACGN,oBAAoB,QAAQ;KAChC,EACDE;IAEF,MAAMU,gBAAgBC,yBAAyBH,IAAI,OAAO;IAE1D,MAAM,EAAEI,IAAI,EAAE,GAAGb;IACjB,MAAM,EAAEc,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYJ;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOE,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUV;IACZ;IAEAf,MACE,oBACAe,oBACA,YACAa,KAAK,SAAS,CAACF;IAGjB,MAAMG,mBAAqC,EAAE;IAC7C,MAAMC,mBAA6D,EAAE;IACrE,IAAIC,iBAAiB;IACrBL,OAAO,OAAO,CAAC,CAACM;QACd,MAAMC,aAAcD,AAAAA,CAAAA,OAAO,WAAW,IAAI,EAAC,EAAG,WAAW;QACzD,IAAIC,AAAe,YAAfA,YAAwB;YAC1BC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM5B,QAAQ+B,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM7B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BqB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIQ,AAAe,kBAAfA,YAA8B;YACvCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM5B,QAAQ+B,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM7B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BqB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,mBAAfA,YAA+B;YACxCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM5B,QAAQ+B,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM7B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BqB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,WAAfA,YAAuB;YAChCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCE,OAAOF,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMI,aAAaD,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YAC5D,MAAMY,WAAWF,SAASH,OAAO,aAAa,CAAC,OAAO,EAAEP;YACxDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM7B,YACJ;4BAAE,GAAGiC,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCX,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQO,OAAO,OAAO,IAAI;wBAC1B,MAAM7B,YACJ;4BAAE,GAAGkC,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCZ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,WAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOG,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,AAAe,aAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWG,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,AAAe,eAAfA,YAA2B;YACpCF,iBAAiB;YACjBF,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO,CAAC;gBACR,SAASG,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,aAAfA,YACT,IAAKD,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMM,OAAOC,qBAAqBP,OAAO,aAAa,CAAC,GAAG;YAE1DH,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASS,KAAK,IAAI,CAAC;gBACrB;gBACA,SAASN,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEQ,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAe,WAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASG,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,YAAY;YAErBH,iBAAiB,IAAI,CAAC;gBACpB,MAAMG;gBACN,SAASD,OAAO,OAAO,IAAI;YAC7B;YACAhC,MAAM,0BAA0BiC,YAAY,YAAYD,OAAO,OAAO;QACxE;IACF;IAEA,IAAIH,AAA4B,MAA5BA,iBAAiB,MAAM,EAAQ;QACjC,MAAMY,eAAyB,EAAE;QAGjC,IAAIf,AAAkB,MAAlBA,OAAO,MAAM,EAAQ;YACvBe,aAAa,IAAI,CAAC;YAGlB,IACEpB,IAAI,OAAO,CAAC,QAAQ,CAAC,eACrB,CAACA,IAAI,OAAO,CAAC,QAAQ,CAAC,YAEtBoB,aAAa,IAAI,CACf;iBAGFA,aAAa,IAAI,CAAC;QAEtB;QAGA,IAAIX,iBAAiB,MAAM,GAAG,GAAG;YAC/B,MAAMY,QAAQZ,iBAAiB,GAAG,CAAC,CAACa,IAAMA,EAAE,IAAI,EAAE,IAAI,CAAC;YACvDF,aAAa,IAAI,CAAC,CAAC,wBAAwB,EAAEC,OAAO;QACtD;QAEA,MAAME,eAAe;YACnB;eACGH;YACH,CAAC,gBAAgB,EAAEpB,IAAI,OAAO,EAAE;SACjC,CAAC,IAAI,CAAC;QAEP,MAAM,IAAIwB,MAAMD,cAAc;YAC5B,OAAO;gBACL,YAAYvB,IAAI,OAAO;gBACvBK;gBACAI;gBACAP;YACF;QACF;IACF;IAEAvB,MAAM,oBAAoB4B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IACjE,MAAMiB,MAAMC,WAAW1B,IAAI,OAAO;IAElCV,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAASmC;IACX;IAEA,OAAO;QACL,SAASjB;QACTiB;QACA,OAAOzB,IAAI,KAAK;QAChB,aAAaO,KAAK,SAAS,CAACP,IAAI,OAAO,EAAE2B,QAAW;QACpD,oCAAoCjB;IACtC;AACF;AAOA,SAASP,yBAAyByB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAIvD,KAAK,KAAK,CAAEkD,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAIxD,KAAK,KAAK,CAAEoD,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAAShB,SAAS8B,QAAgB,EAAExC,IAAuC;IACzE,MAAM,CAACqC,GAAGC,EAAE,GAAGnC,KAAK,KAAK,CAACqC;IAC1B,OAAO;QAACH,IAAIrC,KAAK,KAAK;QAAEsC,IAAItC,KAAK,MAAM;KAAC;AAC1C;AAkFO,eAAeL,qBACpB8C,WAAmB,EACnBzC,IAAU,EACV0C,aAA6C;IAE7C,IAAIA,kBAAkBC,mBAAmB,IAAI,EAAE;QAC7CpE,MAAM,uCAAuCyB;QAC7C,MAAM4C,gBAAgB5C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAM6C,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAehE,KAAK,IAAI,CAAC+D,YAAYD;YAC3C,MAAMG,WAAWjE,KAAK,KAAK,CAACkB,KAAK,KAAK,GAAG8C;YACzC,MAAME,YAAYlE,KAAK,KAAK,CAACkB,KAAK,MAAM,GAAG8C;YAC3CvE,MACE,2DACAwE,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|
|
1
|
+
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["../../../src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n PlanningAIResponse,\n PlanningAction,\n Size,\n UIContext,\n} from '@/types';\nimport { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ConversationHistory } from './conversation-history';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { callAIWithStringResponse } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'left_double'\n | 'right_single'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function uiTarsPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig, actionContext } = options;\n const { uiTarsModelVersion } = modelConfig;\n\n let instruction = userInstruction;\n if (actionContext) {\n instruction = `<high_priority_knowledge>${actionContext}</high_priority_knowledge>\\n<user_instruction>${userInstruction}</user_instruction>`;\n }\n\n const systemPrompt = getUiTarsPlanningPrompt() + instruction;\n\n const screenshotBase64 = context.screenshot.getData();\n const imagePayload = await resizeImageForUiTars(\n screenshotBase64,\n context.size,\n uiTarsModelVersion,\n );\n\n conversationHistory.append({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n },\n },\n ],\n });\n\n const res = await callAIWithStringResponse(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory.snapshot(),\n ],\n modelConfig,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const { size } = context;\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: uiTarsModelVersion,\n });\n\n debug(\n 'ui-tars modelVer',\n uiTarsModelVersion,\n ', parsed',\n JSON.stringify(parsed),\n );\n\n const transformActions: PlanningAction[] = [];\n const unhandledActions: Array<{ type: string; thought: string }> = [];\n let shouldContinue = true;\n parsed.forEach((action) => {\n const actionType = (action.action_type || '').toLowerCase();\n if (actionType === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (actionType === 'left_double') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'DoubleClick',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'right_single') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'RightClick',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (actionType === 'finished') {\n shouldContinue = false;\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (actionType === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys.join('+'),\n },\n thought: action.thought || '',\n });\n }\n } else if (actionType === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n } else if (actionType) {\n // Track unhandled action types\n unhandledActions.push({\n type: actionType,\n thought: action.thought || '',\n });\n debug('Unhandled action type:', actionType, 'thought:', action.thought);\n }\n });\n\n if (transformActions.length === 0) {\n const errorDetails: string[] = [];\n\n // Check if parsing failed\n if (parsed.length === 0) {\n errorDetails.push('Action parser returned no actions');\n\n // Check if response has Thought but no Action\n if (\n res.content.includes('Thought:') &&\n !res.content.includes('Action:')\n ) {\n errorDetails.push(\n 'Response contains \"Thought:\" but missing \"Action:\" line',\n );\n } else {\n errorDetails.push('Response may be malformed or empty');\n }\n }\n\n // Check if we have unhandled action types\n if (unhandledActions.length > 0) {\n const types = unhandledActions.map((a) => a.type).join(', ');\n errorDetails.push(`Unhandled action types: ${types}`);\n }\n\n const errorMessage = [\n 'No actions found in UI-TARS response.',\n ...errorDetails,\n `\\nRaw response: ${res.content}`,\n ].join('\\n');\n\n throw new Error(errorMessage, {\n cause: {\n prediction: res.content,\n parsed,\n unhandledActions,\n convertedText,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n const log = getSummary(res.content);\n\n conversationHistory.append({\n role: 'assistant',\n content: log,\n });\n\n return {\n actions: transformActions,\n log,\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n shouldContinuePlanning: shouldContinue,\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface LeftDoubleAction extends BaseAction {\n action_type: 'left_double';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface RightSingleAction extends BaseAction {\n action_type: 'right_single';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | LeftDoubleAction\n | RightSingleAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n uiTarsVersion: UITarsModelVersion | undefined,\n) {\n if (uiTarsVersion === UITarsModelVersion.V1_5) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","uiTarsPlanning","userInstruction","options","conversationHistory","context","modelConfig","actionContext","uiTarsModelVersion","instruction","systemPrompt","getUiTarsPlanningPrompt","screenshotBase64","imagePayload","resizeImageForUiTars","res","callAIWithStringResponse","convertedText","convertBboxToCoordinates","size","parsed","actionParser","JSON","transformActions","unhandledActions","shouldContinue","action","actionType","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","errorDetails","types","a","errorMessage","Error","log","getSummary","undefined","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","imageBase64","uiTarsVersion","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;AA0BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,eACpBC,eAAuB,EACvBC,OAKC;IAED,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,WAAW,EAAEC,aAAa,EAAE,GAAGJ;IACrE,MAAM,EAAEK,kBAAkB,EAAE,GAAGF;IAE/B,IAAIG,cAAcP;IAClB,IAAIK,eACFE,cAAc,CAAC,yBAAyB,EAAEF,cAAc,8CAA8C,EAAEL,gBAAgB,mBAAmB,CAAC;IAG9I,MAAMQ,eAAeC,4BAA4BF;IAEjD,MAAMG,mBAAmBP,QAAQ,UAAU,CAAC,OAAO;IACnD,MAAMQ,eAAe,MAAMC,qBACzBF,kBACAP,QAAQ,IAAI,EACZG;IAGFJ,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKS;gBACP;YACF;SACD;IACH;IAEA,MAAME,MAAM,MAAMC,yBAChB;QACE;YACE,MAAM;YACN,SAASN;QACX;WACGN,oBAAoB,QAAQ;KAChC,EACDE;IAEF,MAAMW,gBAAgBC,yBAAyBH,IAAI,OAAO;IAE1D,MAAM,EAAEI,IAAI,EAAE,GAAGd;IACjB,MAAM,EAAEe,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYJ;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOE,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUX;IACZ;IAEAf,MACE,oBACAe,oBACA,YACAc,KAAK,SAAS,CAACF;IAGjB,MAAMG,mBAAqC,EAAE;IAC7C,MAAMC,mBAA6D,EAAE;IACrE,IAAIC,iBAAiB;IACrBL,OAAO,OAAO,CAAC,CAACM;QACd,MAAMC,aAAcD,AAAAA,CAAAA,OAAO,WAAW,IAAI,EAAC,EAAG,WAAW;QACzD,IAAIC,AAAe,YAAfA,YAAwB;YAC1BC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM7B,QAAQgC,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM9B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BsB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIQ,AAAe,kBAAfA,YAA8B;YACvCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM7B,QAAQgC,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM9B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BsB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,mBAAfA,YAA+B;YACxCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM7B,QAAQgC,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM9B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BsB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,WAAfA,YAAuB;YAChCC,OAAOF,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCE,OAAOF,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMI,aAAaD,SAASH,OAAO,aAAa,CAAC,SAAS,EAAEP;YAC5D,MAAMY,WAAWF,SAASH,OAAO,aAAa,CAAC,OAAO,EAAEP;YACxDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQG,OAAO,OAAO,IAAI;wBAC1B,MAAM9B,YACJ;4BAAE,GAAGkC,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCX,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQO,OAAO,OAAO,IAAI;wBAC1B,MAAM9B,YACJ;4BAAE,GAAGmC,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCZ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASO,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,WAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOG,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,AAAe,aAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWG,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,AAAe,eAAfA,YAA2B;YACpCF,iBAAiB;YACjBF,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO,CAAC;gBACR,SAASG,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIC,AAAe,aAAfA,YACT,IAAKD,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMM,OAAOC,qBAAqBP,OAAO,aAAa,CAAC,GAAG;YAE1DH,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASS,KAAK,IAAI,CAAC;gBACrB;gBACA,SAASN,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEQ,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAe,WAAfA,YACTJ,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASG,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIC,YAAY;YAErBH,iBAAiB,IAAI,CAAC;gBACpB,MAAMG;gBACN,SAASD,OAAO,OAAO,IAAI;YAC7B;YACAjC,MAAM,0BAA0BkC,YAAY,YAAYD,OAAO,OAAO;QACxE;IACF;IAEA,IAAIH,AAA4B,MAA5BA,iBAAiB,MAAM,EAAQ;QACjC,MAAMY,eAAyB,EAAE;QAGjC,IAAIf,AAAkB,MAAlBA,OAAO,MAAM,EAAQ;YACvBe,aAAa,IAAI,CAAC;YAGlB,IACEpB,IAAI,OAAO,CAAC,QAAQ,CAAC,eACrB,CAACA,IAAI,OAAO,CAAC,QAAQ,CAAC,YAEtBoB,aAAa,IAAI,CACf;iBAGFA,aAAa,IAAI,CAAC;QAEtB;QAGA,IAAIX,iBAAiB,MAAM,GAAG,GAAG;YAC/B,MAAMY,QAAQZ,iBAAiB,GAAG,CAAC,CAACa,IAAMA,EAAE,IAAI,EAAE,IAAI,CAAC;YACvDF,aAAa,IAAI,CAAC,CAAC,wBAAwB,EAAEC,OAAO;QACtD;QAEA,MAAME,eAAe;YACnB;eACGH;YACH,CAAC,gBAAgB,EAAEpB,IAAI,OAAO,EAAE;SACjC,CAAC,IAAI,CAAC;QAEP,MAAM,IAAIwB,MAAMD,cAAc;YAC5B,OAAO;gBACL,YAAYvB,IAAI,OAAO;gBACvBK;gBACAI;gBACAP;YACF;QACF;IACF;IAEAxB,MAAM,oBAAoB6B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IACjE,MAAMiB,MAAMC,WAAW1B,IAAI,OAAO;IAElCX,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAASoC;IACX;IAEA,OAAO;QACL,SAASjB;QACTiB;QACA,OAAOzB,IAAI,KAAK;QAChB,aAAaO,KAAK,SAAS,CAACP,IAAI,OAAO,EAAE2B,QAAW;QACpD,wBAAwBjB;IAC1B;AACF;AAOA,SAASP,yBAAyByB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAIxD,KAAK,KAAK,CAAEmD,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAIzD,KAAK,KAAK,CAAEqD,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAAShB,SAAS8B,QAAgB,EAAExC,IAAuC;IACzE,MAAM,CAACqC,GAAGC,EAAE,GAAGnC,KAAK,KAAK,CAACqC;IAC1B,OAAO;QAACH,IAAIrC,KAAK,KAAK;QAAEsC,IAAItC,KAAK,MAAM;KAAC;AAC1C;AAkFO,eAAeL,qBACpB8C,WAAmB,EACnBzC,IAAU,EACV0C,aAA6C;IAE7C,IAAIA,kBAAkBC,mBAAmB,IAAI,EAAE;QAC7CrE,MAAM,uCAAuC0B;QAC7C,MAAM4C,gBAAgB5C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAM6C,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAejE,KAAK,IAAI,CAACgE,YAAYD;YAC3C,MAAMG,WAAWlE,KAAK,KAAK,CAACmB,KAAK,KAAK,GAAG8C;YACzC,MAAME,YAAYnE,KAAK,KAAK,CAACmB,KAAK,MAAM,GAAG8C;YAC3CxE,MACE,2DACAyE,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|
package/dist/es/common.mjs
CHANGED
|
@@ -178,7 +178,7 @@ async function markupImageForLLM(screenshotBase64, tree, size) {
|
|
|
178
178
|
});
|
|
179
179
|
return imagePayload;
|
|
180
180
|
}
|
|
181
|
-
function buildYamlFlowFromPlans(plans, actionSpace
|
|
181
|
+
function buildYamlFlowFromPlans(plans, actionSpace) {
|
|
182
182
|
const flow = [];
|
|
183
183
|
for (const plan of plans){
|
|
184
184
|
const verb = plan.type;
|
|
@@ -195,9 +195,6 @@ function buildYamlFlowFromPlans(plans, actionSpace, sleep) {
|
|
|
195
195
|
};
|
|
196
196
|
flow.push(flowItem);
|
|
197
197
|
}
|
|
198
|
-
if (sleep) flow.push({
|
|
199
|
-
sleep
|
|
200
|
-
});
|
|
201
198
|
return flow;
|
|
202
199
|
}
|
|
203
200
|
const PointSchema = z.object({
|
|
@@ -331,6 +328,18 @@ const parseActionParam = (rawParam, zodSchema)=>{
|
|
|
331
328
|
for(const fieldName in locateFieldValues)validated[fieldName] = locateFieldValues[fieldName];
|
|
332
329
|
return validated;
|
|
333
330
|
};
|
|
334
|
-
|
|
331
|
+
const finalizeActionName = 'Finalize';
|
|
332
|
+
const getReadableTimeString = (format = 'YYYY-MM-DD HH:mm:ss')=>{
|
|
333
|
+
const now = new Date();
|
|
334
|
+
const year = now.getFullYear();
|
|
335
|
+
const month = String(now.getMonth() + 1).padStart(2, '0');
|
|
336
|
+
const day = String(now.getDate()).padStart(2, '0');
|
|
337
|
+
const hours = String(now.getHours()).padStart(2, '0');
|
|
338
|
+
const minutes = String(now.getMinutes()).padStart(2, '0');
|
|
339
|
+
const seconds = String(now.getSeconds()).padStart(2, '0');
|
|
340
|
+
const timeString = format.replace('YYYY', String(year)).replace('MM', month).replace('DD', day).replace('HH', hours).replace('mm', minutes).replace('ss', seconds);
|
|
341
|
+
return `${timeString} (${format})`;
|
|
342
|
+
};
|
|
343
|
+
export { PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwen2_5Bbox, buildYamlFlowFromPlans, dumpActionParam, dumpMidsceneLocatorField, expandSearchArea, fillBboxParam, finalizeActionName, findAllMidsceneLocatorField, getMidsceneLocationSchema, getReadableTimeString, ifMidsceneLocatorField, loadActionParam, markupImageForLLM, mergeRects, normalized01000, parseActionParam };
|
|
335
344
|
|
|
336
345
|
//# sourceMappingURL=common.mjs.map
|