@midscene/core 0.26.7-beta-20250821134240.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model/service-caller/index.mjs +15 -7
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +25 -81
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/insight/index.mjs +4 -4
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/ai-model/service-caller/index.js +15 -7
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +25 -81
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/insight/index.js +4 -4
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model/ui-tars-planning.d.ts +2 -9
- package/dist/types/types.d.ts +3 -2
- package/package.json +3 -3
|
@@ -14,7 +14,7 @@ import { assertSchema } from "../prompt/assertion.mjs";
|
|
|
14
14
|
import { locatorSchema } from "../prompt/llm-locator.mjs";
|
|
15
15
|
import { planSchema } from "../prompt/llm-planning.mjs";
|
|
16
16
|
async function createChatClient({ AIActionTypeValue, modelPreferences }) {
|
|
17
|
-
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, openaiUseAzureDeprecated, useAzureOpenai, azureOpenaiScope, azureOpenaiKey, azureOpenaiEndpoint, azureOpenaiApiVersion, azureOpenaiDeployment, azureExtraConfig, useAnthropicSdk, anthropicApiKey } = decideModelConfig(modelPreferences, true);
|
|
17
|
+
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, openaiUseAzureDeprecated, useAzureOpenai, azureOpenaiScope, azureOpenaiKey, azureOpenaiEndpoint, azureOpenaiApiVersion, azureOpenaiDeployment, azureExtraConfig, useAnthropicSdk, anthropicApiKey, modelDescription } = decideModelConfig(modelPreferences, true);
|
|
18
18
|
let openai;
|
|
19
19
|
let proxyAgent;
|
|
20
20
|
const debugProxy = getDebug('ai:call:proxy');
|
|
@@ -75,7 +75,8 @@ async function createChatClient({ AIActionTypeValue, modelPreferences }) {
|
|
|
75
75
|
if (void 0 !== openai) return {
|
|
76
76
|
completion: openai.chat.completions,
|
|
77
77
|
style: 'openai',
|
|
78
|
-
modelName
|
|
78
|
+
modelName,
|
|
79
|
+
modelDescription
|
|
79
80
|
};
|
|
80
81
|
if (useAnthropicSdk) openai = new Anthropic({
|
|
81
82
|
apiKey: anthropicApiKey,
|
|
@@ -85,12 +86,13 @@ async function createChatClient({ AIActionTypeValue, modelPreferences }) {
|
|
|
85
86
|
if (void 0 !== openai && openai.messages) return {
|
|
86
87
|
completion: openai.messages,
|
|
87
88
|
style: 'anthropic',
|
|
88
|
-
modelName
|
|
89
|
+
modelName,
|
|
90
|
+
modelDescription
|
|
89
91
|
};
|
|
90
92
|
throw new Error('Openai SDK or Anthropic SDK is not initialized');
|
|
91
93
|
}
|
|
92
94
|
async function call(messages, AIActionTypeValue, modelPreferences, options) {
|
|
93
|
-
const { completion, style, modelName } = await createChatClient({
|
|
95
|
+
const { completion, style, modelName, modelDescription } = await createChatClient({
|
|
94
96
|
AIActionTypeValue,
|
|
95
97
|
modelPreferences
|
|
96
98
|
});
|
|
@@ -161,7 +163,9 @@ async function call(messages, AIActionTypeValue, modelPreferences, options) {
|
|
|
161
163
|
completion_tokens: usage.completion_tokens ?? 0,
|
|
162
164
|
total_tokens: usage.total_tokens ?? 0,
|
|
163
165
|
time_cost: timeCost ?? 0,
|
|
164
|
-
model_name: modelName
|
|
166
|
+
model_name: modelName,
|
|
167
|
+
model_description: modelDescription,
|
|
168
|
+
intent: modelPreferences.intent
|
|
165
169
|
}
|
|
166
170
|
};
|
|
167
171
|
options.onChunk(finalChunk);
|
|
@@ -242,7 +246,9 @@ async function call(messages, AIActionTypeValue, modelPreferences, options) {
|
|
|
242
246
|
completion_tokens: anthropicUsage.output_tokens ?? 0,
|
|
243
247
|
total_tokens: (anthropicUsage.input_tokens ?? 0) + (anthropicUsage.output_tokens ?? 0),
|
|
244
248
|
time_cost: timeCost ?? 0,
|
|
245
|
-
model_name: modelName
|
|
249
|
+
model_name: modelName,
|
|
250
|
+
model_description: modelDescription,
|
|
251
|
+
intent: modelPreferences.intent
|
|
246
252
|
} : void 0
|
|
247
253
|
};
|
|
248
254
|
options.onChunk(finalChunk);
|
|
@@ -282,7 +288,9 @@ async function call(messages, AIActionTypeValue, modelPreferences, options) {
|
|
|
282
288
|
completion_tokens: usage.completion_tokens ?? 0,
|
|
283
289
|
total_tokens: usage.total_tokens ?? 0,
|
|
284
290
|
time_cost: timeCost ?? 0,
|
|
285
|
-
model_name: modelName
|
|
291
|
+
model_name: modelName,
|
|
292
|
+
model_description: modelDescription,
|
|
293
|
+
intent: modelPreferences.intent
|
|
286
294
|
} : void 0,
|
|
287
295
|
isStreamed: !!isStreaming
|
|
288
296
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["webpack://@midscene/core/./src/ai-model/service-caller/index.ts"],"sourcesContent":["import { AIResponseFormat, type AIUsageInfo } from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport { Anthropic } from '@anthropic-ai/sdk';\nimport {\n DefaultAzureCredential,\n getBearerTokenProvider,\n} from '@azure/identity';\nimport {\n type IModelPreferences,\n MIDSCENE_API_TYPE,\n MIDSCENE_LANGSMITH_DEBUG,\n OPENAI_MAX_TOKENS,\n decideModelConfig,\n getAIConfig,\n getAIConfigInBoolean,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { parseBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ifInBrowser } from '@midscene/shared/utils';\nimport { HttpsProxyAgent } from 'https-proxy-agent';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI, { AzureOpenAI } from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport { SocksProxyAgent } from 'socks-proxy-agent';\nimport { AIActionType, type AIArgs } from '../common';\nimport { assertSchema } from '../prompt/assertion';\nimport { locatorSchema } from '../prompt/llm-locator';\nimport { planSchema } from '../prompt/llm-planning';\n\nasync function createChatClient({\n AIActionTypeValue,\n modelPreferences,\n}: {\n AIActionTypeValue: AIActionType;\n modelPreferences: IModelPreferences;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n style: 'openai' | 'anthropic';\n modelName: string;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n openaiUseAzureDeprecated,\n useAzureOpenai,\n azureOpenaiScope,\n azureOpenaiKey,\n azureOpenaiEndpoint,\n azureOpenaiApiVersion,\n azureOpenaiDeployment,\n azureExtraConfig,\n useAnthropicSdk,\n anthropicApiKey,\n } = decideModelConfig(modelPreferences, true);\n\n let openai: OpenAI | AzureOpenAI | undefined;\n\n let proxyAgent = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n if (httpProxy) {\n debugProxy('using http proxy', httpProxy);\n proxyAgent = new HttpsProxyAgent(httpProxy);\n } else if (socksProxy) {\n debugProxy('using socks proxy', socksProxy);\n proxyAgent = new SocksProxyAgent(socksProxy);\n }\n\n if (openaiUseAzureDeprecated) {\n // this is deprecated\n openai = new AzureOpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n dangerouslyAllowBrowser: true,\n }) as OpenAI;\n } else if (useAzureOpenai) {\n // https://learn.microsoft.com/en-us/azure/ai-services/openai/chatgpt-quickstart?tabs=bash%2Cjavascript-key%2Ctypescript-keyless%2Cpython&pivots=programming-language-javascript#rest-api\n // keyless authentication\n let tokenProvider: any = undefined;\n if (azureOpenaiScope) {\n assert(\n !ifInBrowser,\n 'Azure OpenAI is not supported in browser with Midscene.',\n );\n const credential = new DefaultAzureCredential();\n\n tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);\n\n openai = new AzureOpenAI({\n azureADTokenProvider: tokenProvider,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n } else {\n // endpoint, apiKey, apiVersion, deployment\n openai = new AzureOpenAI({\n apiKey: azureOpenaiKey,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n dangerouslyAllowBrowser: true,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n }\n } else if (!useAnthropicSdk) {\n openai = new OpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n defaultHeaders: {\n ...(openaiExtraConfig?.defaultHeaders || {}),\n [MIDSCENE_API_TYPE]: AIActionTypeValue.toString(),\n },\n dangerouslyAllowBrowser: true,\n });\n }\n\n if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n const { wrapOpenAI } = await import('langsmith/wrappers');\n openai = wrapOpenAI(openai);\n }\n\n if (typeof openai !== 'undefined') {\n return {\n completion: openai.chat.completions,\n style: 'openai',\n modelName,\n };\n }\n\n // Anthropic\n if (useAnthropicSdk) {\n openai = new Anthropic({\n apiKey: anthropicApiKey,\n httpAgent: proxyAgent,\n dangerouslyAllowBrowser: true,\n }) as any;\n }\n\n if (typeof openai !== 'undefined' && (openai as any).messages) {\n return {\n completion: (openai as any).messages,\n style: 'anthropic',\n modelName,\n };\n }\n\n throw new Error('Openai SDK or Anthropic SDK is not initialized');\n}\n\nexport async function call(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n },\n): Promise<{ content: string; usage?: AIUsageInfo; isStreamed: boolean }> {\n const { completion, style, modelName } = await createChatClient({\n AIActionTypeValue,\n modelPreferences,\n });\n\n const responseFormat = getResponseFormat(modelName, AIActionTypeValue);\n\n const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const commonConfig = {\n temperature: vlLocateMode(modelPreferences) === 'vlm-ui-tars' ? 0.0 : 0.1,\n stream: !!isStreaming,\n max_tokens:\n typeof maxTokens === 'number'\n ? maxTokens\n : Number.parseInt(maxTokens || '2048', 10),\n ...(vlLocateMode(modelPreferences) === 'qwen-vl' // qwen specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n\n try {\n if (style === 'openai') {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n },\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, cost-ms, ${timeCost}`,\n );\n } else {\n const result = await completion.create({\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, ui-tars-version, ${uiTarsModelVersion(modelPreferences)}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n assert(\n result.choices,\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n content = result.choices[0].message.content!;\n usage = result.usage;\n }\n\n debugCall(`response: ${content}`);\n assert(content, 'empty content');\n } else if (style === 'anthropic') {\n const convertImageContent = (content: any) => {\n if (content.type === 'image_url') {\n const imgBase64 = content.image_url.url;\n assert(imgBase64, 'image_url is required');\n const { mimeType, body } = parseBase64(content.image_url.url);\n return {\n source: {\n type: 'base64',\n media_type: mimeType,\n data: body,\n },\n type: 'image',\n };\n }\n return content;\n };\n\n if (isStreaming) {\n const stream = (await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any)) as any;\n\n for await (const chunk of stream) {\n const content = chunk.delta?.text || '';\n if (content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n accumulated,\n reasoning_content: '',\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.type === 'message_stop') {\n timeCost = Date.now() - startTime;\n const anthropicUsage = chunk.usage;\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: anthropicUsage\n ? {\n prompt_tokens: anthropicUsage.input_tokens ?? 0,\n completion_tokens: anthropicUsage.output_tokens ?? 0,\n total_tokens:\n (anthropicUsage.input_tokens ?? 0) +\n (anthropicUsage.output_tokens ?? 0),\n time_cost: timeCost ?? 0,\n model_name: modelName,\n }\n : undefined,\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n } else {\n const result = await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n content = (result as any).content[0].text as string;\n usage = result.usage;\n }\n\n assert(content, 'empty content');\n }\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n return {\n content: content || '',\n usage: usage\n ? {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n }\n : undefined,\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport const getResponseFormat = (\n modelName: string,\n AIActionTypeValue: AIActionType,\n):\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject => {\n let responseFormat:\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject\n | undefined;\n\n if (modelName.includes('gpt-4')) {\n switch (AIActionTypeValue) {\n case AIActionType.ASSERT:\n responseFormat = assertSchema;\n break;\n case AIActionType.INSPECT_ELEMENT:\n responseFormat = locatorSchema;\n break;\n case AIActionType.PLAN:\n responseFormat = planSchema;\n break;\n case AIActionType.EXTRACT_DATA:\n case AIActionType.DESCRIBE_ELEMENT:\n responseFormat = { type: AIResponseFormat.JSON };\n break;\n }\n }\n\n // gpt-4o-2024-05-13 only supports json_object response format\n if (modelName === 'gpt-4o-2024-05-13') {\n responseFormat = { type: AIResponseFormat.JSON };\n }\n\n return responseFormat;\n};\n\nexport async function callToGetJSONObject<T>(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const response = await call(messages, AIActionTypeValue, modelPreferences);\n assert(response, 'empty response');\n const jsonContent = safeParseJson(response.content, modelPreferences);\n return { content: jsonContent, usage: response.usage };\n}\n\nexport async function callAiFnWithStringResponse<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await call(\n msgs,\n AIActionTypeValue,\n modelPreferences,\n );\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function safeParseJson(\n input: string,\n modelPreferences: IModelPreferences,\n) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n try {\n return JSON.parse(cleanJsonString);\n } catch {}\n try {\n return JSON.parse(jsonrepair(cleanJsonString));\n } catch (e) {}\n\n if (\n vlLocateMode(modelPreferences) === 'doubao-vision' ||\n vlLocateMode(modelPreferences) === 'vlm-ui-tars'\n ) {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n return JSON.parse(jsonrepair(jsonString));\n }\n throw Error(`failed to parse json response: ${input}`);\n}\n"],"names":["createChatClient","AIActionTypeValue","modelPreferences","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","openaiUseAzureDeprecated","useAzureOpenai","azureOpenaiScope","azureOpenaiKey","azureOpenaiEndpoint","azureOpenaiApiVersion","azureOpenaiDeployment","azureExtraConfig","useAnthropicSdk","anthropicApiKey","decideModelConfig","openai","proxyAgent","debugProxy","getDebug","HttpsProxyAgent","SocksProxyAgent","AzureOpenAI","tokenProvider","assert","ifInBrowser","credential","DefaultAzureCredential","getBearerTokenProvider","OpenAI","MIDSCENE_API_TYPE","getAIConfigInBoolean","MIDSCENE_LANGSMITH_DEBUG","Error","console","wrapOpenAI","Anthropic","call","messages","options","completion","style","responseFormat","getResponseFormat","maxTokens","getAIConfig","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","isStreaming","content","accumulated","usage","timeCost","commonConfig","vlLocateMode","Number","stream","chunk","_chunk_choices__delta","_chunk_choices__delta1","_chunk_choices_2","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","_result_usage","_result_usage1","_result_usage2","result","uiTarsModelVersion","JSON","convertImageContent","imgBase64","mimeType","body","parseBase64","m","Array","_chunk_delta","anthropicUsage","e","newError","AIActionType","assertSchema","locatorSchema","planSchema","AIResponseFormat","callToGetJSONObject","response","jsonContent","safeParseJson","callAiFnWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","cleanJsonString","_cleanJsonString_match","jsonrepair","jsonString"],"mappings":";;;;;;;;;;;;;;;AAiCA,eAAeA,iBAAiB,EAC9BC,iBAAiB,EACjBC,gBAAgB,EAIjB;IAKC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,wBAAwB,EACxBC,cAAc,EACdC,gBAAgB,EAChBC,cAAc,EACdC,mBAAmB,EACnBC,qBAAqB,EACrBC,qBAAqB,EACrBC,gBAAgB,EAChBC,eAAe,EACfC,eAAe,EAChB,GAAGC,kBAAkBjB,kBAAkB;IAExC,IAAIkB;IAEJ,IAAIC;IACJ,MAAMC,aAAaC,SAAS;IAC5B,IAAInB,WAAW;QACbkB,WAAW,oBAAoBlB;QAC/BiB,aAAa,IAAIG,gBAAgBpB;IACnC,OAAO,IAAID,YAAY;QACrBmB,WAAW,qBAAqBnB;QAChCkB,aAAa,IAAII,gBAAgBtB;IACnC;IAEA,IAAIM,0BAEFW,SAAS,IAAIM,YAAY;QACvB,SAASpB;QACT,QAAQC;QACR,WAAWc;QACX,GAAGb,iBAAiB;QACpB,yBAAyB;IAC3B;SACK,IAAIE,gBAAgB;QAGzB,IAAIiB;QACJ,IAAIhB,kBAAkB;YACpBiB,OACE,CAACC,aACD;YAEF,MAAMC,aAAa,IAAIC;YAEvBJ,gBAAgBK,uBAAuBF,YAAYnB;YAEnDS,SAAS,IAAIM,YAAY;gBACvB,sBAAsBC;gBACtB,UAAUd;gBACV,YAAYC;gBACZ,YAAYC;gBACZ,GAAGP,iBAAiB;gBACpB,GAAGQ,gBAAgB;YACrB;QACF,OAEEI,SAAS,IAAIM,YAAY;YACvB,QAAQd;YACR,UAAUC;YACV,YAAYC;YACZ,YAAYC;YACZ,yBAAyB;YACzB,GAAGP,iBAAiB;YACpB,GAAGQ,gBAAgB;QACrB;IAEJ,OAAO,IAAI,CAACC,iBACVG,SAAS,IAAIa,SAAO;QAClB,SAAS3B;QACT,QAAQC;QACR,WAAWc;QACX,GAAGb,iBAAiB;QACpB,gBAAgB;YACd,GAAIA,AAAAA,CAAAA,QAAAA,oBAAAA,KAAAA,IAAAA,kBAAmB,cAAc,AAAD,KAAK,CAAC,CAAC;YAC3C,CAAC0B,kBAAkB,EAAEjC,kBAAkB,QAAQ;QACjD;QACA,yBAAyB;IAC3B;IAGF,IAAImB,UAAUe,qBAAqBC,2BAA2B;QAC5D,IAAIP,aACF,MAAM,IAAIQ,MAAM;QAElBC,QAAQ,GAAG,CAAC;QACZ,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC;QACpCnB,SAASmB,WAAWnB;IACtB;IAEA,IAAI,AAAkB,WAAXA,QACT,OAAO;QACL,YAAYA,OAAO,IAAI,CAAC,WAAW;QACnC,OAAO;QACPf;IACF;IAIF,IAAIY,iBACFG,SAAS,IAAIoB,UAAU;QACrB,QAAQtB;QACR,WAAWG;QACX,yBAAyB;IAC3B;IAGF,IAAI,AAAkB,WAAXD,UAA2BA,OAAe,QAAQ,EAC3D,OAAO;QACL,YAAaA,OAAe,QAAQ;QACpC,OAAO;QACPf;IACF;IAGF,MAAM,IAAIgC,MAAM;AAClB;AAEO,eAAeI,KACpBC,QAAsC,EACtCzC,iBAA+B,EAC/BC,gBAAmC,EACnCyC,OAGC;IAED,MAAM,EAAEC,UAAU,EAAEC,KAAK,EAAExC,SAAS,EAAE,GAAG,MAAML,iBAAiB;QAC9DC;QACAC;IACF;IAEA,MAAM4C,iBAAiBC,kBAAkB1C,WAAWJ;IAEpD,MAAM+C,YAAYC,YAAYC;IAC9B,MAAMC,YAAY5B,SAAS;IAC3B,MAAM6B,oBAAoB7B,SAAS;IACnC,MAAM8B,qBAAqB9B,SAAS;IAEpC,MAAM+B,YAAYC,KAAK,GAAG;IAE1B,MAAMC,cAAcb,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,MAAM,AAAD,KAAKA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD;IACtD,IAAIc;IACJ,IAAIC,cAAc;IAClB,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,eAAe;QACnB,aAAaC,AAAmC,kBAAnCA,aAAa5D,oBAAsC,MAAM;QACtE,QAAQ,CAAC,CAACsD;QACV,YACE,AAAqB,YAArB,OAAOR,YACHA,YACAe,OAAO,QAAQ,CAACf,aAAa,QAAQ;QAC3C,GAAIc,AAAmC,cAAnCA,aAAa5D,oBACb;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IAEA,IAAI;QACF,IAAI2C,AAAU,aAAVA,OAAoB;YACtBM,UACE,CAAC,QAAQ,EAAEK,cAAc,eAAe,GAAG,WAAW,EAAEnD,WAAW;YAGrE,IAAImD,aAAa;gBACf,MAAMQ,SAAU,MAAMpB,WAAW,MAAM,CACrC;oBACE,OAAOvC;oBACPqC;oBACA,iBAAiBI;oBACjB,GAAGe,YAAY;gBACjB,GACA;oBACE,QAAQ;gBACV;gBAKF,WAAW,MAAMI,SAASD,OAAQ;wBAChBE,uBAAAA,iBAAAA,gBAEbC,wBAAAA,kBAAAA,iBAoBCC,kBAAAA;oBAtBJ,MAAMX,UAAUS,AAAAA,SAAAA,CAAAA,iBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,kBAAAA,cAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,wBAAAA,gBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,sBAA2B,OAAO,AAAD,KAAK;oBACtD,MAAMG,oBACJ,AAAC,SAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,yBAAAA,iBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,uBAAmC,iBAAiB,AAAD,KAAK;oBAG3D,IAAIF,MAAM,KAAK,EACbN,QAAQM,MAAM,KAAK;oBAGrB,IAAIR,WAAWY,mBAAmB;wBAChCX,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAY;4BACAX;4BACA,YAAY;4BACZ,OAAOa;wBACT;wBACA5B,QAAQ,OAAO,CAAE2B;oBACnB;oBAGA,IAAI,QAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,iBAAoB,aAAa,EAAE;wBACrCR,WAAWL,KAAK,GAAG,KAAKD;wBAGxB,IAAI,CAACK,OAAO;4BAEV,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAACf,YAAY,MAAM,GAAG;4BAElCC,QAAQ;gCACN,eAAea;gCACf,mBAAmBA;gCACnB,cAAcA,AAAkB,IAAlBA;4BAChB;wBACF;wBAGA,MAAME,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO;gCACL,eAAeC,MAAM,aAAa,IAAI;gCACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gCAC9C,cAAcA,MAAM,YAAY,IAAI;gCACpC,WAAWC,YAAY;gCACvB,YAAYvD;4BACd;wBACF;wBACAsC,QAAQ,OAAO,CAAE+B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;gBACVN,kBACE,CAAC,iBAAiB,EAAE/C,UAAU,QAAQ,EAAEyD,aAAa5D,qBAAqB,UAAU,WAAW,EAAE0D,UAAU;YAE/G,OAAO;oBAUoJe,eAAyDC,gBAAwDC;gBAT1Q,MAAMC,SAAS,MAAMlC,WAAW,MAAM,CAAC;oBACrC,OAAOvC;oBACPqC;oBACA,iBAAiBI;oBACjB,GAAGe,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAE/C,UAAU,QAAQ,EAAEyD,aAAa5D,qBAAqB,UAAU,mBAAmB,EAAE6E,mBAAmB7E,kBAAkB,iBAAiB,EAAEyE,AAAAA,SAAAA,CAAAA,gBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,cAAc,aAAa,AAAD,KAAK,GAAG,qBAAqB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,iBAAiB,AAAD,KAAK,GAAG,gBAAgB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,YAAY,AAAD,KAAK,GAAG,WAAW,EAAEjB,SAAS,aAAa,EAAEkB,OAAO,WAAW,IAAI,IAAI;gBAG1WzB,mBACE,CAAC,oBAAoB,EAAE2B,KAAK,SAAS,CAACF,OAAO,KAAK,GAAG;gBAGvDlD,OACEkD,OAAO,OAAO,EACd,CAAC,mCAAmC,EAAEE,KAAK,SAAS,CAACF,SAAS;gBAEhErB,UAAUqB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3CnB,QAAQmB,OAAO,KAAK;YACtB;YAEA3B,UAAU,CAAC,UAAU,EAAEM,SAAS;YAChC7B,OAAO6B,SAAS;QAClB,OAAO,IAAIZ,AAAU,gBAAVA,OAAuB;YAChC,MAAMoC,sBAAsB,CAACxB;gBAC3B,IAAIA,AAAiB,gBAAjBA,QAAQ,IAAI,EAAkB;oBAChC,MAAMyB,YAAYzB,QAAQ,SAAS,CAAC,GAAG;oBACvC7B,OAAOsD,WAAW;oBAClB,MAAM,EAAEC,QAAQ,EAAEC,IAAI,EAAE,GAAGC,YAAY5B,QAAQ,SAAS,CAAC,GAAG;oBAC5D,OAAO;wBACL,QAAQ;4BACN,MAAM;4BACN,YAAY0B;4BACZ,MAAMC;wBACR;wBACA,MAAM;oBACR;gBACF;gBACA,OAAO3B;YACT;YAEA,IAAID,aAAa;gBACf,MAAMQ,SAAU,MAAMpB,WAAW,MAAM,CAAC;oBACtC,OAAOvC;oBACP,QAAQ;oBACR,UAAUqC,SAAS,GAAG,CAAC,CAAC4C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBxC;oBACjB,GAAGe,YAAY;gBACjB;gBAEA,WAAW,MAAMI,SAASD,OAAQ;wBAChBwB;oBAAhB,MAAM/B,UAAU+B,AAAAA,SAAAA,CAAAA,eAAAA,MAAM,KAAK,AAAD,IAAVA,KAAAA,IAAAA,aAAa,IAAI,AAAD,KAAK;oBACrC,IAAI/B,SAAS;wBACXC,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAC;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAOa;wBACT;wBACA5B,QAAQ,OAAO,CAAE2B;oBACnB;oBAGA,IAAIL,AAAe,mBAAfA,MAAM,IAAI,EAAqB;wBACjCL,WAAWL,KAAK,GAAG,KAAKD;wBACxB,MAAMmC,iBAAiBxB,MAAM,KAAK;wBAGlC,MAAMS,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO+B,iBACH;gCACE,eAAeA,eAAe,YAAY,IAAI;gCAC9C,mBAAmBA,eAAe,aAAa,IAAI;gCACnD,cACGA,AAAAA,CAAAA,eAAe,YAAY,IAAI,KAC/BA,CAAAA,eAAe,aAAa,IAAI;gCACnC,WAAW7B,YAAY;gCACvB,YAAYvD;4BACd,IACAkE;wBACN;wBACA5B,QAAQ,OAAO,CAAE+B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;YACZ,OAAO;gBACL,MAAMoB,SAAS,MAAMlC,WAAW,MAAM,CAAC;oBACrC,OAAOvC;oBACP,QAAQ;oBACR,UAAUqC,SAAS,GAAG,CAAC,CAAC4C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBxC;oBACjB,GAAGe,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBACxBG,UAAWqB,OAAe,OAAO,CAAC,EAAE,CAAC,IAAI;gBACzCnB,QAAQmB,OAAO,KAAK;YACtB;YAEAlD,OAAO6B,SAAS;QAClB;QAEA,IAAID,eAAe,CAACG,OAAO;YAEzB,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEhB,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCE,QAAQ;gBACN,eAAea;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASf,WAAW;YACpB,OAAOE,QACH;gBACE,eAAeA,MAAM,aAAa,IAAI;gBACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gBAC9C,cAAcA,MAAM,YAAY,IAAI;gBACpC,WAAWC,YAAY;gBACvB,YAAYvD;YACd,IACAkE;YACJ,YAAY,CAAC,CAACf;QAChB;IACF,EAAE,OAAOkC,GAAQ;QACfpD,QAAQ,KAAK,CAAC,kBAAkBoD;QAChC,MAAMC,WAAW,IAAItD,MACnB,CAAC,eAAe,EAAEmB,cAAc,eAAe,GAAG,kBAAkB,EAAEkC,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC/I;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,MAAM5C,oBAAoB,CAC/B1C,WACAJ;IAIA,IAAI6C;IAKJ,IAAIzC,UAAU,QAAQ,CAAC,UACrB,OAAQJ;QACN,KAAK2F,aAAa,MAAM;YACtB9C,iBAAiB+C;YACjB;QACF,KAAKD,aAAa,eAAe;YAC/B9C,iBAAiBgD;YACjB;QACF,KAAKF,aAAa,IAAI;YACpB9C,iBAAiBiD;YACjB;QACF,KAAKH,aAAa,YAAY;QAC9B,KAAKA,aAAa,gBAAgB;YAChC9C,iBAAiB;gBAAE,MAAMkD,iBAAiB,IAAI;YAAC;YAC/C;IACJ;IAIF,IAAI3F,AAAc,wBAAdA,WACFyC,iBAAiB;QAAE,MAAMkD,iBAAiB,IAAI;IAAC;IAGjD,OAAOlD;AACT;AAEO,eAAemD,oBACpBvD,QAAsC,EACtCzC,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAMgG,WAAW,MAAMzD,KAAKC,UAAUzC,mBAAmBC;IACzD0B,OAAOsE,UAAU;IACjB,MAAMC,cAAcC,cAAcF,SAAS,OAAO,EAAEhG;IACpD,OAAO;QAAE,SAASiG;QAAa,OAAOD,SAAS,KAAK;IAAC;AACvD;AAEO,eAAeG,2BACpBC,IAAY,EACZrG,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAM,EAAEuD,OAAO,EAAEE,KAAK,EAAE,GAAG,MAAMlB,KAC/B6D,MACArG,mBACAC;IAEF,OAAO;QAAEuD;QAASE;IAAM;AAC1B;AAEO,SAAS4C,yBAAyBL,QAAgB;IACvD,IAAI;QAEF,MAAMM,YAAYN,SAAS,KAAK,CAAC;QACjC,IAAIM,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBP,SAAS,KAAK,CACnC;QAEF,IAAIO,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBR,SAAS,KAAK,CAAC;QACrC,IAAIQ,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOR;AACT;AAEO,SAASS,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASR,cACdQ,KAAa,EACb1G,gBAAmC;IAEnC,MAAM2G,kBAAkBN,yBAAyBK;IAEjD,IAAIC,QAAAA,kBAAAA,KAAAA,IAAAA,gBAAiB,KAAK,CAAC,oBAAoB;YACtCC;QAAP,OAAO,QAAAA,CAAAA,yBAAAA,gBACJ,KAAK,CAAC,kBAAiB,IADnBA,KAAAA,IAAAA,uBAEH,KAAK,CAAC,GACP,GAAG,CAAC/C;IACT;IACA,IAAI;QACF,OAAOiB,KAAK,KAAK,CAAC6B;IACpB,EAAE,OAAM,CAAC;IACT,IAAI;QACF,OAAO7B,KAAK,KAAK,CAAC+B,WAAWF;IAC/B,EAAE,OAAOnB,GAAG,CAAC;IAEb,IACE5B,AAAmC,oBAAnCA,aAAa5D,qBACb4D,AAAmC,kBAAnCA,aAAa5D,mBACb;QACA,MAAM8G,aAAaL,yBAAyBE;QAC5C,OAAO7B,KAAK,KAAK,CAAC+B,WAAWC;IAC/B;IACA,MAAM3E,MAAM,CAAC,+BAA+B,EAAEuE,OAAO;AACvD"}
|
|
1
|
+
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["webpack://@midscene/core/./src/ai-model/service-caller/index.ts"],"sourcesContent":["import { AIResponseFormat, type AIUsageInfo } from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport { Anthropic } from '@anthropic-ai/sdk';\nimport {\n DefaultAzureCredential,\n getBearerTokenProvider,\n} from '@azure/identity';\nimport {\n type IModelPreferences,\n MIDSCENE_API_TYPE,\n MIDSCENE_LANGSMITH_DEBUG,\n OPENAI_MAX_TOKENS,\n decideModelConfig,\n getAIConfig,\n getAIConfigInBoolean,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { parseBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ifInBrowser } from '@midscene/shared/utils';\nimport { HttpsProxyAgent } from 'https-proxy-agent';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI, { AzureOpenAI } from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport { SocksProxyAgent } from 'socks-proxy-agent';\nimport { AIActionType, type AIArgs } from '../common';\nimport { assertSchema } from '../prompt/assertion';\nimport { locatorSchema } from '../prompt/llm-locator';\nimport { planSchema } from '../prompt/llm-planning';\n\nasync function createChatClient({\n AIActionTypeValue,\n modelPreferences,\n}: {\n AIActionTypeValue: AIActionType;\n modelPreferences: IModelPreferences;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n style: 'openai' | 'anthropic';\n modelName: string;\n modelDescription: string;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n openaiUseAzureDeprecated,\n useAzureOpenai,\n azureOpenaiScope,\n azureOpenaiKey,\n azureOpenaiEndpoint,\n azureOpenaiApiVersion,\n azureOpenaiDeployment,\n azureExtraConfig,\n useAnthropicSdk,\n anthropicApiKey,\n modelDescription,\n } = decideModelConfig(modelPreferences, true);\n\n let openai: OpenAI | AzureOpenAI | undefined;\n\n let proxyAgent = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n if (httpProxy) {\n debugProxy('using http proxy', httpProxy);\n proxyAgent = new HttpsProxyAgent(httpProxy);\n } else if (socksProxy) {\n debugProxy('using socks proxy', socksProxy);\n proxyAgent = new SocksProxyAgent(socksProxy);\n }\n\n if (openaiUseAzureDeprecated) {\n // this is deprecated\n openai = new AzureOpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n dangerouslyAllowBrowser: true,\n }) as OpenAI;\n } else if (useAzureOpenai) {\n // https://learn.microsoft.com/en-us/azure/ai-services/openai/chatgpt-quickstart?tabs=bash%2Cjavascript-key%2Ctypescript-keyless%2Cpython&pivots=programming-language-javascript#rest-api\n // keyless authentication\n let tokenProvider: any = undefined;\n if (azureOpenaiScope) {\n assert(\n !ifInBrowser,\n 'Azure OpenAI is not supported in browser with Midscene.',\n );\n const credential = new DefaultAzureCredential();\n\n tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);\n\n openai = new AzureOpenAI({\n azureADTokenProvider: tokenProvider,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n } else {\n // endpoint, apiKey, apiVersion, deployment\n openai = new AzureOpenAI({\n apiKey: azureOpenaiKey,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n dangerouslyAllowBrowser: true,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n }\n } else if (!useAnthropicSdk) {\n openai = new OpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n defaultHeaders: {\n ...(openaiExtraConfig?.defaultHeaders || {}),\n [MIDSCENE_API_TYPE]: AIActionTypeValue.toString(),\n },\n dangerouslyAllowBrowser: true,\n });\n }\n\n if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n const { wrapOpenAI } = await import('langsmith/wrappers');\n openai = wrapOpenAI(openai);\n }\n\n if (typeof openai !== 'undefined') {\n return {\n completion: openai.chat.completions,\n style: 'openai',\n modelName,\n modelDescription,\n };\n }\n\n // Anthropic\n if (useAnthropicSdk) {\n openai = new Anthropic({\n apiKey: anthropicApiKey,\n httpAgent: proxyAgent,\n dangerouslyAllowBrowser: true,\n }) as any;\n }\n\n if (typeof openai !== 'undefined' && (openai as any).messages) {\n return {\n completion: (openai as any).messages,\n style: 'anthropic',\n modelName,\n modelDescription,\n };\n }\n\n throw new Error('Openai SDK or Anthropic SDK is not initialized');\n}\n\nexport async function call(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n },\n): Promise<{ content: string; usage?: AIUsageInfo; isStreamed: boolean }> {\n const { completion, style, modelName, modelDescription } =\n await createChatClient({\n AIActionTypeValue,\n modelPreferences,\n });\n\n const responseFormat = getResponseFormat(modelName, AIActionTypeValue);\n\n const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const commonConfig = {\n temperature: vlLocateMode(modelPreferences) === 'vlm-ui-tars' ? 0.0 : 0.1,\n stream: !!isStreaming,\n max_tokens:\n typeof maxTokens === 'number'\n ? maxTokens\n : Number.parseInt(maxTokens || '2048', 10),\n ...(vlLocateMode(modelPreferences) === 'qwen-vl' // qwen specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n\n try {\n if (style === 'openai') {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n },\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, cost-ms, ${timeCost}`,\n );\n } else {\n const result = await completion.create({\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, ui-tars-version, ${uiTarsModelVersion(modelPreferences)}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n assert(\n result.choices,\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n content = result.choices[0].message.content!;\n usage = result.usage;\n }\n\n debugCall(`response: ${content}`);\n assert(content, 'empty content');\n } else if (style === 'anthropic') {\n const convertImageContent = (content: any) => {\n if (content.type === 'image_url') {\n const imgBase64 = content.image_url.url;\n assert(imgBase64, 'image_url is required');\n const { mimeType, body } = parseBase64(content.image_url.url);\n return {\n source: {\n type: 'base64',\n media_type: mimeType,\n data: body,\n },\n type: 'image',\n };\n }\n return content;\n };\n\n if (isStreaming) {\n const stream = (await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any)) as any;\n\n for await (const chunk of stream) {\n const content = chunk.delta?.text || '';\n if (content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n accumulated,\n reasoning_content: '',\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.type === 'message_stop') {\n timeCost = Date.now() - startTime;\n const anthropicUsage = chunk.usage;\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: anthropicUsage\n ? {\n prompt_tokens: anthropicUsage.input_tokens ?? 0,\n completion_tokens: anthropicUsage.output_tokens ?? 0,\n total_tokens:\n (anthropicUsage.input_tokens ?? 0) +\n (anthropicUsage.output_tokens ?? 0),\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n }\n : undefined,\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n } else {\n const result = await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n content = (result as any).content[0].text as string;\n usage = result.usage;\n }\n\n assert(content, 'empty content');\n }\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n return {\n content: content || '',\n usage: usage\n ? {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n }\n : undefined,\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport const getResponseFormat = (\n modelName: string,\n AIActionTypeValue: AIActionType,\n):\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject => {\n let responseFormat:\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject\n | undefined;\n\n if (modelName.includes('gpt-4')) {\n switch (AIActionTypeValue) {\n case AIActionType.ASSERT:\n responseFormat = assertSchema;\n break;\n case AIActionType.INSPECT_ELEMENT:\n responseFormat = locatorSchema;\n break;\n case AIActionType.PLAN:\n responseFormat = planSchema;\n break;\n case AIActionType.EXTRACT_DATA:\n case AIActionType.DESCRIBE_ELEMENT:\n responseFormat = { type: AIResponseFormat.JSON };\n break;\n }\n }\n\n // gpt-4o-2024-05-13 only supports json_object response format\n if (modelName === 'gpt-4o-2024-05-13') {\n responseFormat = { type: AIResponseFormat.JSON };\n }\n\n return responseFormat;\n};\n\nexport async function callToGetJSONObject<T>(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const response = await call(messages, AIActionTypeValue, modelPreferences);\n assert(response, 'empty response');\n const jsonContent = safeParseJson(response.content, modelPreferences);\n return { content: jsonContent, usage: response.usage };\n}\n\nexport async function callAiFnWithStringResponse<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await call(\n msgs,\n AIActionTypeValue,\n modelPreferences,\n );\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function safeParseJson(\n input: string,\n modelPreferences: IModelPreferences,\n) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n try {\n return JSON.parse(cleanJsonString);\n } catch {}\n try {\n return JSON.parse(jsonrepair(cleanJsonString));\n } catch (e) {}\n\n if (\n vlLocateMode(modelPreferences) === 'doubao-vision' ||\n vlLocateMode(modelPreferences) === 'vlm-ui-tars'\n ) {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n return JSON.parse(jsonrepair(jsonString));\n }\n throw Error(`failed to parse json response: ${input}`);\n}\n"],"names":["createChatClient","AIActionTypeValue","modelPreferences","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","openaiUseAzureDeprecated","useAzureOpenai","azureOpenaiScope","azureOpenaiKey","azureOpenaiEndpoint","azureOpenaiApiVersion","azureOpenaiDeployment","azureExtraConfig","useAnthropicSdk","anthropicApiKey","modelDescription","decideModelConfig","openai","proxyAgent","debugProxy","getDebug","HttpsProxyAgent","SocksProxyAgent","AzureOpenAI","tokenProvider","assert","ifInBrowser","credential","DefaultAzureCredential","getBearerTokenProvider","OpenAI","MIDSCENE_API_TYPE","getAIConfigInBoolean","MIDSCENE_LANGSMITH_DEBUG","Error","console","wrapOpenAI","Anthropic","call","messages","options","completion","style","responseFormat","getResponseFormat","maxTokens","getAIConfig","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","isStreaming","content","accumulated","usage","timeCost","commonConfig","vlLocateMode","Number","stream","chunk","_chunk_choices__delta","_chunk_choices__delta1","_chunk_choices_2","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","_result_usage","_result_usage1","_result_usage2","result","uiTarsModelVersion","JSON","convertImageContent","imgBase64","mimeType","body","parseBase64","m","Array","_chunk_delta","anthropicUsage","e","newError","AIActionType","assertSchema","locatorSchema","planSchema","AIResponseFormat","callToGetJSONObject","response","jsonContent","safeParseJson","callAiFnWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","cleanJsonString","_cleanJsonString_match","jsonrepair","jsonString"],"mappings":";;;;;;;;;;;;;;;AAiCA,eAAeA,iBAAiB,EAC9BC,iBAAiB,EACjBC,gBAAgB,EAIjB;IAMC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,wBAAwB,EACxBC,cAAc,EACdC,gBAAgB,EAChBC,cAAc,EACdC,mBAAmB,EACnBC,qBAAqB,EACrBC,qBAAqB,EACrBC,gBAAgB,EAChBC,eAAe,EACfC,eAAe,EACfC,gBAAgB,EACjB,GAAGC,kBAAkBlB,kBAAkB;IAExC,IAAImB;IAEJ,IAAIC;IACJ,MAAMC,aAAaC,SAAS;IAC5B,IAAIpB,WAAW;QACbmB,WAAW,oBAAoBnB;QAC/BkB,aAAa,IAAIG,gBAAgBrB;IACnC,OAAO,IAAID,YAAY;QACrBoB,WAAW,qBAAqBpB;QAChCmB,aAAa,IAAII,gBAAgBvB;IACnC;IAEA,IAAIM,0BAEFY,SAAS,IAAIM,YAAY;QACvB,SAASrB;QACT,QAAQC;QACR,WAAWe;QACX,GAAGd,iBAAiB;QACpB,yBAAyB;IAC3B;SACK,IAAIE,gBAAgB;QAGzB,IAAIkB;QACJ,IAAIjB,kBAAkB;YACpBkB,OACE,CAACC,aACD;YAEF,MAAMC,aAAa,IAAIC;YAEvBJ,gBAAgBK,uBAAuBF,YAAYpB;YAEnDU,SAAS,IAAIM,YAAY;gBACvB,sBAAsBC;gBACtB,UAAUf;gBACV,YAAYC;gBACZ,YAAYC;gBACZ,GAAGP,iBAAiB;gBACpB,GAAGQ,gBAAgB;YACrB;QACF,OAEEK,SAAS,IAAIM,YAAY;YACvB,QAAQf;YACR,UAAUC;YACV,YAAYC;YACZ,YAAYC;YACZ,yBAAyB;YACzB,GAAGP,iBAAiB;YACpB,GAAGQ,gBAAgB;QACrB;IAEJ,OAAO,IAAI,CAACC,iBACVI,SAAS,IAAIa,SAAO;QAClB,SAAS5B;QACT,QAAQC;QACR,WAAWe;QACX,GAAGd,iBAAiB;QACpB,gBAAgB;YACd,GAAIA,AAAAA,CAAAA,QAAAA,oBAAAA,KAAAA,IAAAA,kBAAmB,cAAc,AAAD,KAAK,CAAC,CAAC;YAC3C,CAAC2B,kBAAkB,EAAElC,kBAAkB,QAAQ;QACjD;QACA,yBAAyB;IAC3B;IAGF,IAAIoB,UAAUe,qBAAqBC,2BAA2B;QAC5D,IAAIP,aACF,MAAM,IAAIQ,MAAM;QAElBC,QAAQ,GAAG,CAAC;QACZ,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC;QACpCnB,SAASmB,WAAWnB;IACtB;IAEA,IAAI,AAAkB,WAAXA,QACT,OAAO;QACL,YAAYA,OAAO,IAAI,CAAC,WAAW;QACnC,OAAO;QACPhB;QACAc;IACF;IAIF,IAAIF,iBACFI,SAAS,IAAIoB,UAAU;QACrB,QAAQvB;QACR,WAAWI;QACX,yBAAyB;IAC3B;IAGF,IAAI,AAAkB,WAAXD,UAA2BA,OAAe,QAAQ,EAC3D,OAAO;QACL,YAAaA,OAAe,QAAQ;QACpC,OAAO;QACPhB;QACAc;IACF;IAGF,MAAM,IAAImB,MAAM;AAClB;AAEO,eAAeI,KACpBC,QAAsC,EACtC1C,iBAA+B,EAC/BC,gBAAmC,EACnC0C,OAGC;IAED,MAAM,EAAEC,UAAU,EAAEC,KAAK,EAAEzC,SAAS,EAAEc,gBAAgB,EAAE,GACtD,MAAMnB,iBAAiB;QACrBC;QACAC;IACF;IAEF,MAAM6C,iBAAiBC,kBAAkB3C,WAAWJ;IAEpD,MAAMgD,YAAYC,YAAYC;IAC9B,MAAMC,YAAY5B,SAAS;IAC3B,MAAM6B,oBAAoB7B,SAAS;IACnC,MAAM8B,qBAAqB9B,SAAS;IAEpC,MAAM+B,YAAYC,KAAK,GAAG;IAE1B,MAAMC,cAAcb,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,MAAM,AAAD,KAAKA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD;IACtD,IAAIc;IACJ,IAAIC,cAAc;IAClB,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,eAAe;QACnB,aAAaC,AAAmC,kBAAnCA,aAAa7D,oBAAsC,MAAM;QACtE,QAAQ,CAAC,CAACuD;QACV,YACE,AAAqB,YAArB,OAAOR,YACHA,YACAe,OAAO,QAAQ,CAACf,aAAa,QAAQ;QAC3C,GAAIc,AAAmC,cAAnCA,aAAa7D,oBACb;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IAEA,IAAI;QACF,IAAI4C,AAAU,aAAVA,OAAoB;YACtBM,UACE,CAAC,QAAQ,EAAEK,cAAc,eAAe,GAAG,WAAW,EAAEpD,WAAW;YAGrE,IAAIoD,aAAa;gBACf,MAAMQ,SAAU,MAAMpB,WAAW,MAAM,CACrC;oBACE,OAAOxC;oBACPsC;oBACA,iBAAiBI;oBACjB,GAAGe,YAAY;gBACjB,GACA;oBACE,QAAQ;gBACV;gBAKF,WAAW,MAAMI,SAASD,OAAQ;wBAChBE,uBAAAA,iBAAAA,gBAEbC,wBAAAA,kBAAAA,iBAoBCC,kBAAAA;oBAtBJ,MAAMX,UAAUS,AAAAA,SAAAA,CAAAA,iBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,kBAAAA,cAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,wBAAAA,gBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,sBAA2B,OAAO,AAAD,KAAK;oBACtD,MAAMG,oBACJ,AAAC,SAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,yBAAAA,iBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,uBAAmC,iBAAiB,AAAD,KAAK;oBAG3D,IAAIF,MAAM,KAAK,EACbN,QAAQM,MAAM,KAAK;oBAGrB,IAAIR,WAAWY,mBAAmB;wBAChCX,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAY;4BACAX;4BACA,YAAY;4BACZ,OAAOa;wBACT;wBACA5B,QAAQ,OAAO,CAAE2B;oBACnB;oBAGA,IAAI,QAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,iBAAoB,aAAa,EAAE;wBACrCR,WAAWL,KAAK,GAAG,KAAKD;wBAGxB,IAAI,CAACK,OAAO;4BAEV,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAACf,YAAY,MAAM,GAAG;4BAElCC,QAAQ;gCACN,eAAea;gCACf,mBAAmBA;gCACnB,cAAcA,AAAkB,IAAlBA;4BAChB;wBACF;wBAGA,MAAME,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO;gCACL,eAAeC,MAAM,aAAa,IAAI;gCACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gCAC9C,cAAcA,MAAM,YAAY,IAAI;gCACpC,WAAWC,YAAY;gCACvB,YAAYxD;gCACZ,mBAAmBc;gCACnB,QAAQjB,iBAAiB,MAAM;4BACjC;wBACF;wBACA0C,QAAQ,OAAO,CAAE+B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;gBACVN,kBACE,CAAC,iBAAiB,EAAEhD,UAAU,QAAQ,EAAE0D,aAAa7D,qBAAqB,UAAU,WAAW,EAAE2D,UAAU;YAE/G,OAAO;oBAUoJe,eAAyDC,gBAAwDC;gBAT1Q,MAAMC,SAAS,MAAMlC,WAAW,MAAM,CAAC;oBACrC,OAAOxC;oBACPsC;oBACA,iBAAiBI;oBACjB,GAAGe,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAEhD,UAAU,QAAQ,EAAE0D,aAAa7D,qBAAqB,UAAU,mBAAmB,EAAE8E,mBAAmB9E,kBAAkB,iBAAiB,EAAE0E,AAAAA,SAAAA,CAAAA,gBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,cAAc,aAAa,AAAD,KAAK,GAAG,qBAAqB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,iBAAiB,AAAD,KAAK,GAAG,gBAAgB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,YAAY,AAAD,KAAK,GAAG,WAAW,EAAEjB,SAAS,aAAa,EAAEkB,OAAO,WAAW,IAAI,IAAI;gBAG1WzB,mBACE,CAAC,oBAAoB,EAAE2B,KAAK,SAAS,CAACF,OAAO,KAAK,GAAG;gBAGvDlD,OACEkD,OAAO,OAAO,EACd,CAAC,mCAAmC,EAAEE,KAAK,SAAS,CAACF,SAAS;gBAEhErB,UAAUqB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3CnB,QAAQmB,OAAO,KAAK;YACtB;YAEA3B,UAAU,CAAC,UAAU,EAAEM,SAAS;YAChC7B,OAAO6B,SAAS;QAClB,OAAO,IAAIZ,AAAU,gBAAVA,OAAuB;YAChC,MAAMoC,sBAAsB,CAACxB;gBAC3B,IAAIA,AAAiB,gBAAjBA,QAAQ,IAAI,EAAkB;oBAChC,MAAMyB,YAAYzB,QAAQ,SAAS,CAAC,GAAG;oBACvC7B,OAAOsD,WAAW;oBAClB,MAAM,EAAEC,QAAQ,EAAEC,IAAI,EAAE,GAAGC,YAAY5B,QAAQ,SAAS,CAAC,GAAG;oBAC5D,OAAO;wBACL,QAAQ;4BACN,MAAM;4BACN,YAAY0B;4BACZ,MAAMC;wBACR;wBACA,MAAM;oBACR;gBACF;gBACA,OAAO3B;YACT;YAEA,IAAID,aAAa;gBACf,MAAMQ,SAAU,MAAMpB,WAAW,MAAM,CAAC;oBACtC,OAAOxC;oBACP,QAAQ;oBACR,UAAUsC,SAAS,GAAG,CAAC,CAAC4C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBxC;oBACjB,GAAGe,YAAY;gBACjB;gBAEA,WAAW,MAAMI,SAASD,OAAQ;wBAChBwB;oBAAhB,MAAM/B,UAAU+B,AAAAA,SAAAA,CAAAA,eAAAA,MAAM,KAAK,AAAD,IAAVA,KAAAA,IAAAA,aAAa,IAAI,AAAD,KAAK;oBACrC,IAAI/B,SAAS;wBACXC,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAC;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAOa;wBACT;wBACA5B,QAAQ,OAAO,CAAE2B;oBACnB;oBAGA,IAAIL,AAAe,mBAAfA,MAAM,IAAI,EAAqB;wBACjCL,WAAWL,KAAK,GAAG,KAAKD;wBACxB,MAAMmC,iBAAiBxB,MAAM,KAAK;wBAGlC,MAAMS,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO+B,iBACH;gCACE,eAAeA,eAAe,YAAY,IAAI;gCAC9C,mBAAmBA,eAAe,aAAa,IAAI;gCACnD,cACGA,AAAAA,CAAAA,eAAe,YAAY,IAAI,KAC/BA,CAAAA,eAAe,aAAa,IAAI;gCACnC,WAAW7B,YAAY;gCACvB,YAAYxD;gCACZ,mBAAmBc;gCACnB,QAAQjB,iBAAiB,MAAM;4BACjC,IACAsE;wBACN;wBACA5B,QAAQ,OAAO,CAAE+B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;YACZ,OAAO;gBACL,MAAMoB,SAAS,MAAMlC,WAAW,MAAM,CAAC;oBACrC,OAAOxC;oBACP,QAAQ;oBACR,UAAUsC,SAAS,GAAG,CAAC,CAAC4C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBxC;oBACjB,GAAGe,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBACxBG,UAAWqB,OAAe,OAAO,CAAC,EAAE,CAAC,IAAI;gBACzCnB,QAAQmB,OAAO,KAAK;YACtB;YAEAlD,OAAO6B,SAAS;QAClB;QAEA,IAAID,eAAe,CAACG,OAAO;YAEzB,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEhB,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCE,QAAQ;gBACN,eAAea;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASf,WAAW;YACpB,OAAOE,QACH;gBACE,eAAeA,MAAM,aAAa,IAAI;gBACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gBAC9C,cAAcA,MAAM,YAAY,IAAI;gBACpC,WAAWC,YAAY;gBACvB,YAAYxD;gBACZ,mBAAmBc;gBACnB,QAAQjB,iBAAiB,MAAM;YACjC,IACAsE;YACJ,YAAY,CAAC,CAACf;QAChB;IACF,EAAE,OAAOkC,GAAQ;QACfpD,QAAQ,KAAK,CAAC,kBAAkBoD;QAChC,MAAMC,WAAW,IAAItD,MACnB,CAAC,eAAe,EAAEmB,cAAc,eAAe,GAAG,kBAAkB,EAAEkC,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC/I;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,MAAM5C,oBAAoB,CAC/B3C,WACAJ;IAIA,IAAI8C;IAKJ,IAAI1C,UAAU,QAAQ,CAAC,UACrB,OAAQJ;QACN,KAAK4F,aAAa,MAAM;YACtB9C,iBAAiB+C;YACjB;QACF,KAAKD,aAAa,eAAe;YAC/B9C,iBAAiBgD;YACjB;QACF,KAAKF,aAAa,IAAI;YACpB9C,iBAAiBiD;YACjB;QACF,KAAKH,aAAa,YAAY;QAC9B,KAAKA,aAAa,gBAAgB;YAChC9C,iBAAiB;gBAAE,MAAMkD,iBAAiB,IAAI;YAAC;YAC/C;IACJ;IAIF,IAAI5F,AAAc,wBAAdA,WACF0C,iBAAiB;QAAE,MAAMkD,iBAAiB,IAAI;IAAC;IAGjD,OAAOlD;AACT;AAEO,eAAemD,oBACpBvD,QAAsC,EACtC1C,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAMiG,WAAW,MAAMzD,KAAKC,UAAU1C,mBAAmBC;IACzD2B,OAAOsE,UAAU;IACjB,MAAMC,cAAcC,cAAcF,SAAS,OAAO,EAAEjG;IACpD,OAAO;QAAE,SAASkG;QAAa,OAAOD,SAAS,KAAK;IAAC;AACvD;AAEO,eAAeG,2BACpBC,IAAY,EACZtG,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAM,EAAEwD,OAAO,EAAEE,KAAK,EAAE,GAAG,MAAMlB,KAC/B6D,MACAtG,mBACAC;IAEF,OAAO;QAAEwD;QAASE;IAAM;AAC1B;AAEO,SAAS4C,yBAAyBL,QAAgB;IACvD,IAAI;QAEF,MAAMM,YAAYN,SAAS,KAAK,CAAC;QACjC,IAAIM,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBP,SAAS,KAAK,CACnC;QAEF,IAAIO,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBR,SAAS,KAAK,CAAC;QACrC,IAAIQ,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOR;AACT;AAEO,SAASS,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASR,cACdQ,KAAa,EACb3G,gBAAmC;IAEnC,MAAM4G,kBAAkBN,yBAAyBK;IAEjD,IAAIC,QAAAA,kBAAAA,KAAAA,IAAAA,gBAAiB,KAAK,CAAC,oBAAoB;YACtCC;QAAP,OAAO,QAAAA,CAAAA,yBAAAA,gBACJ,KAAK,CAAC,kBAAiB,IADnBA,KAAAA,IAAAA,uBAEH,KAAK,CAAC,GACP,GAAG,CAAC/C;IACT;IACA,IAAI;QACF,OAAOiB,KAAK,KAAK,CAAC6B;IACpB,EAAE,OAAM,CAAC;IACT,IAAI;QACF,OAAO7B,KAAK,KAAK,CAAC+B,WAAWF;IAC/B,EAAE,OAAOnB,GAAG,CAAC;IAEb,IACE5B,AAAmC,oBAAnCA,aAAa7D,qBACb6D,AAAmC,kBAAnCA,aAAa7D,mBACb;QACA,MAAM+G,aAAaL,yBAAyBE;QAC5C,OAAO7B,KAAK,KAAK,CAAC+B,WAAWC;IAC/B;IACA,MAAM3E,MAAM,CAAC,+BAA+B,EAAEuE,OAAO;AACvD"}
|
|
@@ -39,33 +39,23 @@ async function vlmPlanning(options) {
|
|
|
39
39
|
},
|
|
40
40
|
modelVer: modelVer || void 0
|
|
41
41
|
});
|
|
42
|
-
debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));
|
|
42
|
+
debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));
|
|
43
43
|
const transformActions = [];
|
|
44
44
|
parsed.forEach((action)=>{
|
|
45
45
|
if ('click' === action.action_type) {
|
|
46
46
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
47
47
|
const point = getPoint(action.action_inputs.start_box, size);
|
|
48
|
-
transformActions.push({
|
|
49
|
-
type: 'Locate',
|
|
50
|
-
param: {},
|
|
51
|
-
locate: {
|
|
52
|
-
prompt: action.thought || '',
|
|
53
|
-
bbox: pointToBbox({
|
|
54
|
-
x: point[0],
|
|
55
|
-
y: point[1]
|
|
56
|
-
}, size.width, size.height)
|
|
57
|
-
}
|
|
58
|
-
});
|
|
59
48
|
transformActions.push({
|
|
60
49
|
type: 'Tap',
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
50
|
+
param: {
|
|
51
|
+
locate: {
|
|
52
|
+
prompt: action.thought || '',
|
|
53
|
+
bbox: pointToBbox({
|
|
54
|
+
x: point[0],
|
|
55
|
+
y: point[1]
|
|
56
|
+
}, size.width, size.height)
|
|
57
|
+
}
|
|
58
|
+
}
|
|
69
59
|
});
|
|
70
60
|
} else if ('drag' === action.action_type) {
|
|
71
61
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
@@ -73,18 +63,23 @@ async function vlmPlanning(options) {
|
|
|
73
63
|
const startPoint = getPoint(action.action_inputs.start_box, size);
|
|
74
64
|
const endPoint = getPoint(action.action_inputs.end_box, size);
|
|
75
65
|
transformActions.push({
|
|
76
|
-
type: '
|
|
66
|
+
type: 'DragAndDrop',
|
|
77
67
|
param: {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
68
|
+
from: {
|
|
69
|
+
prompt: action.thought || '',
|
|
70
|
+
bbox: pointToBbox({
|
|
71
|
+
x: startPoint[0],
|
|
72
|
+
y: startPoint[1]
|
|
73
|
+
}, size.width, size.height)
|
|
81
74
|
},
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
75
|
+
to: {
|
|
76
|
+
prompt: action.thought || '',
|
|
77
|
+
bbox: pointToBbox({
|
|
78
|
+
x: endPoint[0],
|
|
79
|
+
y: endPoint[1]
|
|
80
|
+
}, size.width, size.height)
|
|
85
81
|
}
|
|
86
82
|
},
|
|
87
|
-
locate: null,
|
|
88
83
|
thought: action.thought || ''
|
|
89
84
|
});
|
|
90
85
|
} else if ('type' === action.action_type) transformActions.push({
|
|
@@ -92,7 +87,6 @@ async function vlmPlanning(options) {
|
|
|
92
87
|
param: {
|
|
93
88
|
value: action.action_inputs.content
|
|
94
89
|
},
|
|
95
|
-
locate: null,
|
|
96
90
|
thought: action.thought || ''
|
|
97
91
|
});
|
|
98
92
|
else if ('scroll' === action.action_type) transformActions.push({
|
|
@@ -100,13 +94,11 @@ async function vlmPlanning(options) {
|
|
|
100
94
|
param: {
|
|
101
95
|
direction: action.action_inputs.direction
|
|
102
96
|
},
|
|
103
|
-
locate: null,
|
|
104
97
|
thought: action.thought || ''
|
|
105
98
|
});
|
|
106
99
|
else if ('finished' === action.action_type) transformActions.push({
|
|
107
100
|
type: 'Finished',
|
|
108
101
|
param: {},
|
|
109
|
-
locate: null,
|
|
110
102
|
thought: action.thought || ''
|
|
111
103
|
});
|
|
112
104
|
else if ('hotkey' === action.action_type) if (action.action_inputs.key) {
|
|
@@ -114,9 +106,8 @@ async function vlmPlanning(options) {
|
|
|
114
106
|
transformActions.push({
|
|
115
107
|
type: 'KeyboardPress',
|
|
116
108
|
param: {
|
|
117
|
-
|
|
109
|
+
keyName: keys
|
|
118
110
|
},
|
|
119
|
-
locate: null,
|
|
120
111
|
thought: action.thought || ''
|
|
121
112
|
});
|
|
122
113
|
} else console.warn('No key found in action: hotkey. Will not perform action.');
|
|
@@ -125,56 +116,8 @@ async function vlmPlanning(options) {
|
|
|
125
116
|
param: {
|
|
126
117
|
timeMs: 1000
|
|
127
118
|
},
|
|
128
|
-
locate: null,
|
|
129
|
-
thought: action.thought || ''
|
|
130
|
-
});
|
|
131
|
-
else if ('androidBackButton' === action.action_type) transformActions.push({
|
|
132
|
-
type: 'AndroidBackButton',
|
|
133
|
-
param: {},
|
|
134
|
-
locate: null,
|
|
135
|
-
thought: action.thought || ''
|
|
136
|
-
});
|
|
137
|
-
else if ('androidHomeButton' === action.action_type) transformActions.push({
|
|
138
|
-
type: 'AndroidHomeButton',
|
|
139
|
-
param: {},
|
|
140
|
-
locate: null,
|
|
141
119
|
thought: action.thought || ''
|
|
142
120
|
});
|
|
143
|
-
else if ('androidRecentAppsButton' === action.action_type) transformActions.push({
|
|
144
|
-
type: 'AndroidRecentAppsButton',
|
|
145
|
-
param: {}
|
|
146
|
-
});
|
|
147
|
-
else if ('androidLongPress' === action.action_type) {
|
|
148
|
-
assert(action.action_inputs.start_coords, 'start_coords is required for androidLongPress');
|
|
149
|
-
const point = action.action_inputs.start_coords;
|
|
150
|
-
transformActions.push({
|
|
151
|
-
type: 'AndroidLongPress',
|
|
152
|
-
param: {
|
|
153
|
-
x: point[0],
|
|
154
|
-
y: point[1],
|
|
155
|
-
duration: 1000
|
|
156
|
-
},
|
|
157
|
-
locate: null,
|
|
158
|
-
thought: action.thought || ''
|
|
159
|
-
});
|
|
160
|
-
} else if ('androidPull' === action.action_type) {
|
|
161
|
-
const pullDirection = action.action_inputs.direction || 'down';
|
|
162
|
-
const startPoint = action.action_inputs.start_coords ? {
|
|
163
|
-
x: action.action_inputs.start_coords[0],
|
|
164
|
-
y: action.action_inputs.start_coords[1]
|
|
165
|
-
} : void 0;
|
|
166
|
-
transformActions.push({
|
|
167
|
-
type: 'AndroidPull',
|
|
168
|
-
param: {
|
|
169
|
-
direction: pullDirection,
|
|
170
|
-
startPoint,
|
|
171
|
-
distance: action.action_inputs.distance,
|
|
172
|
-
duration: action.action_inputs.duration || 500
|
|
173
|
-
},
|
|
174
|
-
locate: null,
|
|
175
|
-
thought: action.thought || ''
|
|
176
|
-
});
|
|
177
|
-
}
|
|
178
121
|
});
|
|
179
122
|
if (0 === transformActions.length) throw new Error(`No actions found, response: ${res.content}`, {
|
|
180
123
|
cause: {
|
|
@@ -182,6 +125,7 @@ async function vlmPlanning(options) {
|
|
|
182
125
|
parsed
|
|
183
126
|
}
|
|
184
127
|
});
|
|
128
|
+
debug('transformActions', JSON.stringify(transformActions, null, 2));
|
|
185
129
|
return {
|
|
186
130
|
actions: transformActions,
|
|
187
131
|
actionsFromModel: parsed,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n type IModelPreferences,\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait'\n | 'androidBackButton'\n | 'androidHomeButton'\n | 'androidRecentAppsButton'\n | 'androidLongPress'\n | 'androidPull';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelPreferences: IModelPreferences;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelPreferences } =\n options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelPreferences,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion(modelPreferences);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Locate',\n param: {},\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n });\n transformActions.push({\n type: 'Tap',\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n param: action.thought || '',\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'Drag',\n param: {\n start_box: { x: startPoint[0], y: startPoint[1] },\n end_box: { x: endPoint[0], y: endPoint[1] },\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n value: keys,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidBackButton') {\n transformActions.push({\n type: 'AndroidBackButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidHomeButton') {\n transformActions.push({\n type: 'AndroidHomeButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidRecentAppsButton') {\n transformActions.push({\n type: 'AndroidRecentAppsButton',\n param: {},\n });\n } else if (action.action_type === 'androidLongPress') {\n assert(\n action.action_inputs.start_coords,\n 'start_coords is required for androidLongPress',\n );\n const point = action.action_inputs.start_coords;\n transformActions.push({\n type: 'AndroidLongPress',\n param: {\n x: point[0],\n y: point[1],\n duration: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidPull') {\n const pullDirection = action.action_inputs.direction || 'down';\n const startPoint = action.action_inputs.start_coords\n ? {\n x: action.action_inputs.start_coords[0],\n y: action.action_inputs.start_coords[1],\n }\n : undefined;\n\n transformActions.push({\n type: 'AndroidPull',\n param: {\n direction: pullDirection as 'up' | 'down',\n startPoint,\n distance: (action.action_inputs as any).distance,\n duration: (action.action_inputs as any).duration || 500,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\ninterface AndroidLongPressAction extends BaseAction {\n action_type: 'androidLongPress';\n action_inputs: {\n start_coords: [number, number]; // Coordinates for long press\n duration?: number; // Duration in milliseconds\n };\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction\n | AndroidLongPressAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (\n vlLocateMode(modelPreferences) === 'vlm-ui-tars' &&\n uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelPreferences","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","pullDirection","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AAmCA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,gBAAgB,EAAE,GACpEJ;IACF,MAAMK,eAAeC,4BAA4BJ;IAEjD,MAAMK,MAAM,MAAMC,KAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGJ;KACJ,EACDQ,aAAa,eAAe,EAC5BL;IAEF,MAAMM,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC,mBAAmBT;IAEpC,MAAM,EAAEU,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOP,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUS,YAAYI;IACxB;IAEAzB,MAAM,YAAYqB,UAAU,UAAUK,KAAK,SAAS,CAACH;IAErD,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMxB,QAAQ0B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO,CAAC;gBACR,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMzB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;YACF;YACAe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMzB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;gBACA,OAAOgB,OAAO,OAAO,IAAI;YAC3B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YAC5D,MAAMoB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEhB;YACxDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAW;wBAAE,GAAGI,UAAU,CAAC,EAAE;wBAAE,GAAGA,UAAU,CAAC,EAAE;oBAAC;oBAChD,SAAS;wBAAE,GAAGC,QAAQ,CAAC,EAAE;wBAAE,GAAGA,QAAQ,CAAC,EAAE;oBAAC;gBAC5C;gBACA,QAAQ;gBACR,SAASJ,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,OAAOM;gBACT;gBACA,QAAQ;gBACR,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAdEO,QAAQ,IAAI,CACV;aAcC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,8BAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;QACV;aACK,IAAIC,AAAuB,uBAAvBA,OAAO,WAAW,EAAyB;YACpDC,OACED,OAAO,aAAa,CAAC,YAAY,EACjC;YAEF,MAAMxB,QAAQwB,OAAO,aAAa,CAAC,YAAY;YAC/CD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,GAAGvB,KAAK,CAAC,EAAE;oBACX,GAAGA,KAAK,CAAC,EAAE;oBACX,UAAU;gBACZ;gBACA,QAAQ;gBACR,SAASwB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,kBAAvBA,OAAO,WAAW,EAAoB;YAC/C,MAAMQ,gBAAgBR,OAAO,aAAa,CAAC,SAAS,IAAI;YACxD,MAAMG,aAAaH,OAAO,aAAa,CAAC,YAAY,GAChD;gBACE,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;gBACvC,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;YACzC,IACAH;YAEJE,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAWS;oBACXL;oBACA,UAAWH,OAAO,aAAa,CAAS,QAAQ;oBAChD,UAAWA,OAAO,aAAa,CAAS,QAAQ,IAAI;gBACtD;gBACA,QAAQ;gBACR,SAASA,OAAO,OAAO,IAAI;YAC7B;QACF;IACF;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIU,MAAM,CAAC,4BAA4B,EAAErB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGF,OAAO;QACL,SAASI;QACT,kBAAkBJ;QAClB,gBAAgBe,WAAWtB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBmB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI7C,KAAK,KAAK,CAAEwC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI9C,KAAK,KAAK,CAAE0C,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASX,SAASyB,QAAgB,EAAE3C,IAAuC;IACzE,MAAM,CAACwC,GAAGC,EAAE,GAAG3B,KAAK,KAAK,CAAC6B;IAC1B,OAAO;QAACH,IAAIxC,KAAK,KAAK;QAAEyC,IAAIzC,KAAK,MAAM;KAAC;AAC1C;AA2EO,eAAe4C,qBACpBC,WAAmB,EACnB7C,IAAU,EACVC,gBAAmC;IAEnC,IACE6C,AAAmC,kBAAnCA,aAAa7C,qBACbS,mBAAmBT,sBAAsB8C,mBAAmB,IAAI,EAChE;QACA3D,MAAM,uCAAuCY;QAC7C,MAAMgD,gBAAgBhD,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMiD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAevD,KAAK,IAAI,CAACsD,YAAYD;YAC3C,MAAMG,WAAWxD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGkD;YACzC,MAAME,YAAYzD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGkD;YAC3C9D,MACE,2DACA+D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|
|
1
|
+
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n type IModelPreferences,\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelPreferences: IModelPreferences;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelPreferences } =\n options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelPreferences,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion(modelPreferences);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (\n vlLocateMode(modelPreferences) === 'vlm-ui-tars' &&\n uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelPreferences","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AA8BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,gBAAgB,EAAE,GACpEJ;IACF,MAAMK,eAAeC,4BAA4BJ;IAEjD,MAAMK,MAAM,MAAMC,KAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGJ;KACJ,EACDQ,aAAa,eAAe,EAC5BL;IAEF,MAAMM,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC,mBAAmBT;IAEpC,MAAM,EAAEU,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOP,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUS,YAAYI;IACxB;IAEAzB,MAAM,oBAAoBqB,UAAU,YAAYK,KAAK,SAAS,CAACH;IAE/D,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMxB,QAAQ0B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIgB,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YAC5D,MAAMoB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEhB;YACxDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG4B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCnB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQgB,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG6B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCpB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASgB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASM;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASC,OAAO,OAAO,IAAI;QAC7B;IAEJ;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIS,MAAM,CAAC,4BAA4B,EAAEpB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGFvB,MAAM,oBAAoB0B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IAEjE,OAAO;QACL,SAASA;QACT,kBAAkBJ;QAClB,gBAAgBc,WAAWrB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBkB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI5C,KAAK,KAAK,CAAEuC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI7C,KAAK,KAAK,CAAEyC,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASV,SAASwB,QAAgB,EAAE1C,IAAuC;IACzE,MAAM,CAACuC,GAAGC,EAAE,GAAG1B,KAAK,KAAK,CAAC4B;IAC1B,OAAO;QAACH,IAAIvC,KAAK,KAAK;QAAEwC,IAAIxC,KAAK,MAAM;KAAC;AAC1C;AAkEO,eAAe2C,qBACpBC,WAAmB,EACnB5C,IAAU,EACVC,gBAAmC;IAEnC,IACE4C,AAAmC,kBAAnCA,aAAa5C,qBACbS,mBAAmBT,sBAAsB6C,mBAAmB,IAAI,EAChE;QACA1D,MAAM,uCAAuCY;QAC7C,MAAM+C,gBAAgB/C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMgD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAetD,KAAK,IAAI,CAACqD,YAAYD;YAC3C,MAAMG,WAAWvD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGiD;YACzC,MAAME,YAAYxD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGiD;YAC3C7D,MACE,2DACA8D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|
|
@@ -168,6 +168,9 @@ class Insight {
|
|
|
168
168
|
const context = await this.contextRetrieverFn('describe');
|
|
169
169
|
const { screenshotBase64, size } = context;
|
|
170
170
|
assert(screenshotBase64, 'screenshot is required for insight.describe');
|
|
171
|
+
const modelPreferences = {
|
|
172
|
+
intent: 'grounding'
|
|
173
|
+
};
|
|
171
174
|
const systemPrompt = elementDescriberInstruction();
|
|
172
175
|
const defaultRectSize = 30;
|
|
173
176
|
const targetRect = Array.isArray(target) ? {
|
|
@@ -187,9 +190,6 @@ class Insight {
|
|
|
187
190
|
borderThickness: 3
|
|
188
191
|
});
|
|
189
192
|
if (null == opt ? void 0 : opt.deepThink) {
|
|
190
|
-
const modelPreferences = {
|
|
191
|
-
intent: 'grounding'
|
|
192
|
-
};
|
|
193
193
|
const searchArea = expandSearchArea(targetRect, context.size, modelPreferences);
|
|
194
194
|
debug('describe: set searchArea', searchArea);
|
|
195
195
|
imagePayload = await cropByRect(imagePayload, searchArea, getIsUseQwenVl(modelPreferences));
|
|
@@ -213,7 +213,7 @@ class Insight {
|
|
|
213
213
|
}
|
|
214
214
|
];
|
|
215
215
|
const callAIFn = this.aiVendorFn || callToGetJSONObject;
|
|
216
|
-
const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT);
|
|
216
|
+
const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT, modelPreferences);
|
|
217
217
|
const { content } = res;
|
|
218
218
|
assert(!content.error, `describe failed: ${content.error}`);
|
|
219
219
|
assert(content.description, 'failed to describe the element');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"insight/index.mjs","sources":["webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["import {\n AIActionType,\n type AIArgs,\n callAiFn,\n expandSearchArea,\n} from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callToGetJSONObject,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightOptions,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n MIDSCENE_FORCE_DEEP_THINK,\n getAIConfigInBoolean,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n callAI?: typeof callAiFn<AIElementResponse>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: (...args: Array<any>) => Promise<any> = callAiFn;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt?: LocateOpts,\n ): Promise<LocateResult> {\n const { callAI } = opt || {};\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = getAIConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAI: callAI || this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const modelPreferences: IModelPreferences = {\n intent: 'VQA',\n };\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelPreferences,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const modelPreferences: IModelPreferences = { intent: 'grounding' };\n const searchArea = expandSearchArea(\n targetRect,\n context.size,\n modelPreferences,\n );\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n getIsUseQwenVl(modelPreferences),\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn =\n this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;\n\n const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT);\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Insight","query","opt","_parseResult_errors","callAI","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","getAIConfigInBoolean","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","modelPreferences","vlLocateMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","getIsUseQwenVl","msgs","callAIFn","callToGetJSONObject","res","AIActionType","content","callAiFn","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;AAoDA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IAmCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAgB,EACO;YAkFnBC;QAjFJ,MAAM,EAAEC,MAAM,EAAE,GAAGF,OAAO,CAAC;QAC3B,MAAMG,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,OAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,OAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,qBAC5BC;QAEF,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAEjC,MAAMY,mBAAsC;YAC1C,QAAQ;QACV;QAEA,IAAID,oBAAoB,CAACE,aAAaD,mBAAmB;YACvDE,QAAQ,IAAI,CACV;YAEFH,mBAAmBJ;QACrB;QAEA,MAAMQ,UAAUd,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIe;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBJ;YACtB;YACAN,OACEc,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,gBAAgB;YACxB,QAAQ1B,UAAU,IAAI,CAAC,UAAU;YACjCY;YACA,0BAA0BX;YAC1B,cAAce;QAChB;QAEA,MAAMW,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS9B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAaoB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,gBACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA7B;QAGF,IAAI2B,UACF,MAAM,IAAIM,MAAMN;QAGlB5B,OACE8B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BvC,GAA0B,EAC1BwC,gBAAoC,EAKnC;YA+BGvC;QA9BJG,OACE,AAAsB,YAAtB,OAAOmC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMlC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMK,mBAAsC;YAC1C,QAAQ;QACV;QAEA,MAAMG,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,qBAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAexC;YACfW;QACF;QAEA,MAAMkB,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,gBACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACArC;QAGF,IAAI2B,YAAY,CAACU,QAAQ,CAAC1C,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIsC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B5C,GAEC,EACwD;QACzDI,OAAOwC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCV,OAAOyC,kBAAkB;QAEzB,MAAME,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIlD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAElB,MAAMW,mBAAsC;gBAAE,QAAQ;YAAY;YAClE,MAAMI,aAAawC,iBACjBL,YACApC,QAAQ,IAAI,EACZH;YAEFf,MAAM,4BAA4BmB;YAClCsC,eAAe,MAAMG,WACnBH,cACAtC,YACA0C,eAAe9C;QAEnB;QAEA,MAAM+C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WACJ,IAAI,CAAC,UAAU,IAAIC;QAErB,MAAMC,MAAM,MAAMF,SAASD,MAAMI,aAAa,gBAAgB;QAE9D,MAAM,EAAEC,OAAO,EAAE,GAAGF;QACpBzD,OAAO,CAAC2D,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1D3D,OAAO2D,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IAvUA,YACEjD,OAEmE,EACnEd,GAAoB,CACpB;QAfF;QAIA,qCAAoDgE;QAEpD;QAEA;QAQE5D,OAAOU,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMmD,QAAQ,OAAO,CAACnD;QAGlD,IAAI,AAA2B,WAApBd,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAqTF"}
|
|
1
|
+
{"version":3,"file":"insight/index.mjs","sources":["webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["import {\n AIActionType,\n type AIArgs,\n callAiFn,\n expandSearchArea,\n} from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callToGetJSONObject,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightOptions,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n MIDSCENE_FORCE_DEEP_THINK,\n getAIConfigInBoolean,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n callAI?: typeof callAiFn<AIElementResponse>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: (...args: Array<any>) => Promise<any> = callAiFn;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt?: LocateOpts,\n ): Promise<LocateResult> {\n const { callAI } = opt || {};\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = getAIConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAI: callAI || this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const modelPreferences: IModelPreferences = {\n intent: 'VQA',\n };\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelPreferences,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const modelPreferences: IModelPreferences = { intent: 'grounding' };\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(\n targetRect,\n context.size,\n modelPreferences,\n );\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n getIsUseQwenVl(modelPreferences),\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn =\n this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelPreferences,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Insight","query","opt","_parseResult_errors","callAI","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","getAIConfigInBoolean","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","modelPreferences","vlLocateMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","getIsUseQwenVl","msgs","callAIFn","callToGetJSONObject","res","AIActionType","content","callAiFn","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;AAoDA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IAmCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAgB,EACO;YAkFnBC;QAjFJ,MAAM,EAAEC,MAAM,EAAE,GAAGF,OAAO,CAAC;QAC3B,MAAMG,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,OAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,OAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,qBAC5BC;QAEF,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAEjC,MAAMY,mBAAsC;YAC1C,QAAQ;QACV;QAEA,IAAID,oBAAoB,CAACE,aAAaD,mBAAmB;YACvDE,QAAQ,IAAI,CACV;YAEFH,mBAAmBJ;QACrB;QAEA,MAAMQ,UAAUd,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIe;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBJ;YACtB;YACAN,OACEc,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,gBAAgB;YACxB,QAAQ1B,UAAU,IAAI,CAAC,UAAU;YACjCY;YACA,0BAA0BX;YAC1B,cAAce;QAChB;QAEA,MAAMW,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS9B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAaoB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,gBACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA7B;QAGF,IAAI2B,UACF,MAAM,IAAIM,MAAMN;QAGlB5B,OACE8B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BvC,GAA0B,EAC1BwC,gBAAoC,EAKnC;YA+BGvC;QA9BJG,OACE,AAAsB,YAAtB,OAAOmC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMlC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMK,mBAAsC;YAC1C,QAAQ;QACV;QAEA,MAAMG,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,qBAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAexC;YACfW;QACF;QAEA,MAAMkB,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,gBACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACArC;QAGF,IAAI2B,YAAY,CAACU,QAAQ,CAAC1C,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIsC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B5C,GAEC,EACwD;QACzDI,OAAOwC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCV,OAAOyC,kBAAkB;QAEzB,MAAMlC,mBAAsC;YAAE,QAAQ;QAAY;QAClE,MAAMoC,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIlD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAClB,MAAMe,aAAawC,iBACjBL,YACApC,QAAQ,IAAI,EACZH;YAEFf,MAAM,4BAA4BmB;YAClCsC,eAAe,MAAMG,WACnBH,cACAtC,YACA0C,eAAe9C;QAEnB;QAEA,MAAM+C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WACJ,IAAI,CAAC,UAAU,IAAIC;QAErB,MAAMC,MAAM,MAAMF,SAChBD,MACAI,aAAa,gBAAgB,EAC7BnD;QAGF,MAAM,EAAEoD,OAAO,EAAE,GAAGF;QACpBzD,OAAO,CAAC2D,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1D3D,OAAO2D,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IA1UA,YACEjD,OAEmE,EACnEd,GAAoB,CACpB;QAfF;QAIA,qCAAoDgE;QAEpD;QAEA;QAQE5D,OAAOU,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMmD,QAAQ,OAAO,CAACnD;QAGlD,IAAI,AAA2B,WAApBd,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAwTF"}
|