@midscene/core 1.9.4-beta-20260610095330.0 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/dist/es/agent/task-builder.mjs +3 -1
  2. package/dist/es/agent/task-builder.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +8 -4
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +1 -1
  6. package/dist/es/ai-model/inspect.mjs +11 -2
  7. package/dist/es/ai-model/inspect.mjs.map +1 -1
  8. package/dist/es/ai-model/llm-planning.mjs +4 -2
  9. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/models/auto-glm/locate.mjs +2 -1
  11. package/dist/es/ai-model/models/auto-glm/locate.mjs.map +1 -1
  12. package/dist/es/ai-model/models/auto-glm/planning.mjs +4 -3
  13. package/dist/es/ai-model/models/auto-glm/planning.mjs.map +1 -1
  14. package/dist/es/ai-model/models/gpt.mjs +12 -6
  15. package/dist/es/ai-model/models/gpt.mjs.map +1 -1
  16. package/dist/es/ai-model/models/kimi.mjs +42 -0
  17. package/dist/es/ai-model/models/kimi.mjs.map +1 -0
  18. package/dist/es/ai-model/models/registry.mjs +3 -1
  19. package/dist/es/ai-model/models/registry.mjs.map +1 -1
  20. package/dist/es/ai-model/models/ui-tars/planning.mjs +3 -2
  21. package/dist/es/ai-model/models/ui-tars/planning.mjs.map +1 -1
  22. package/dist/es/ai-model/service-caller/index.mjs +13 -7
  23. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  24. package/dist/es/service/index.mjs +9 -1
  25. package/dist/es/service/index.mjs.map +1 -1
  26. package/dist/es/types.mjs.map +1 -1
  27. package/dist/es/utils.mjs +2 -2
  28. package/dist/lib/agent/task-builder.js +3 -1
  29. package/dist/lib/agent/task-builder.js.map +1 -1
  30. package/dist/lib/agent/tasks.js +8 -4
  31. package/dist/lib/agent/tasks.js.map +1 -1
  32. package/dist/lib/agent/utils.js +1 -1
  33. package/dist/lib/ai-model/inspect.js +11 -2
  34. package/dist/lib/ai-model/inspect.js.map +1 -1
  35. package/dist/lib/ai-model/llm-planning.js +4 -2
  36. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  37. package/dist/lib/ai-model/models/auto-glm/locate.js +2 -1
  38. package/dist/lib/ai-model/models/auto-glm/locate.js.map +1 -1
  39. package/dist/lib/ai-model/models/auto-glm/planning.js +4 -3
  40. package/dist/lib/ai-model/models/auto-glm/planning.js.map +1 -1
  41. package/dist/lib/ai-model/models/gpt.js +12 -6
  42. package/dist/lib/ai-model/models/gpt.js.map +1 -1
  43. package/dist/lib/ai-model/models/kimi.js +76 -0
  44. package/dist/lib/ai-model/models/kimi.js.map +1 -0
  45. package/dist/lib/ai-model/models/registry.js +3 -1
  46. package/dist/lib/ai-model/models/registry.js.map +1 -1
  47. package/dist/lib/ai-model/models/ui-tars/planning.js +3 -2
  48. package/dist/lib/ai-model/models/ui-tars/planning.js.map +1 -1
  49. package/dist/lib/ai-model/service-caller/index.js +13 -7
  50. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  51. package/dist/lib/service/index.js +9 -1
  52. package/dist/lib/service/index.js.map +1 -1
  53. package/dist/lib/types.js.map +1 -1
  54. package/dist/lib/utils.js +2 -2
  55. package/dist/types/ai-model/inspect.d.ts +2 -0
  56. package/dist/types/ai-model/models/gpt.d.ts +2 -2
  57. package/dist/types/ai-model/models/kimi.d.ts +18 -0
  58. package/dist/types/ai-model/models/registry.d.ts +17 -2
  59. package/dist/types/ai-model/service-caller/index.d.ts +9 -1
  60. package/dist/types/ai-model/workflows/inspect/types.d.ts +1 -0
  61. package/dist/types/types.d.ts +15 -0
  62. package/package.json +2 -2
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocateResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport { cropByRect, scaleImage } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport {\n expandSearchArea,\n multimodalPromptToChatMessages,\n userPromptToMultimodalPrompt,\n userPromptToString,\n} from '../common';\nimport type { ModelRuntime } from './models';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n} from './service-caller/index';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport {\n mergePixelBboxesToRect,\n pixelBboxToRect,\n} from './workflows/inspect/locate-result-rect';\nimport { mapSearchAreaPixelBboxToOriginalPixelBbox } from './workflows/inspect/search-area-mapping';\nimport type {\n LocateOptions,\n LocateResult,\n SearchAreaConfig,\n} from './workflows/inspect/types';\n\nexport type InspectAIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nexport {\n userPromptToString as extraTextFromUserPrompt,\n multimodalPromptToChatMessages as promptsToChatParam,\n} from '../common';\n\nfunction hasLocateResult(input: unknown, resultKey: string) {\n if (!input || typeof input !== 'object') {\n return false;\n }\n\n const record = input as Record<string, unknown>;\n const locateResult = record[resultKey];\n return Array.isArray(locateResult)\n ? locateResult.length > 0\n : locateResult !== undefined;\n}\n\nexport async function buildSearchAreaConfig(options: {\n context: UIContext;\n baseRect: Rect;\n}): Promise<SearchAreaConfig> {\n const { context, baseRect } = options;\n const scaleRatio = 2;\n const sectionRect = expandSearchArea(baseRect, context.shotSize);\n\n const croppedResult = await cropByRect(\n context.screenshot.base64,\n sectionRect,\n );\n\n const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);\n return {\n sourceRect: sectionRect,\n image: {\n imageBase64: scaledResult.imageBase64,\n width: scaledResult.width,\n height: scaledResult.height,\n },\n mapping: {\n offset: {\n x: sectionRect.left,\n y: sectionRect.top,\n },\n scale: scaleRatio,\n },\n };\n}\n\nexport async function AiLocateElement(\n options: LocateOptions & { targetElementDescription: TUserPrompt },\n): Promise<LocateResult> {\n const { targetElementDescription, ...locateOptions } = options;\n const locateAdapter = options.modelRuntime.adapter.locate;\n if (locateAdapter.kind === 'custom') {\n return locateAdapter.locateFn(targetElementDescription, locateOptions);\n }\n return genericLocate(targetElementDescription, locateOptions);\n}\n\nexport async function genericLocate(\n elementDescription: TUserPrompt,\n options: LocateOptions,\n): Promise<LocateResult> {\n const { context } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'generic locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n\n assert(elementDescription, 'cannot find the target element description');\n const elementDescriptionText = userPromptToString(elementDescription);\n const userInstructionPrompt = findElementPrompt(elementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(\n adapter.locate.resultAdapter.promptSpec,\n );\n\n const modelImage = options.searchConfig?.image ?? {\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n };\n const preparedImage = await prepareModelImage({\n imageBase64: modelImage.imageBase64,\n width: modelImage.width,\n height: modelImage.height,\n policy: adapter.imagePreprocess,\n });\n\n const imagePayload = preparedImage.imageBase64;\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof elementDescription !== 'string') {\n const addOns = await multimodalPromptToChatMessages(\n userPromptToMultimodalPrompt(elementDescription),\n );\n msgs.push(...addOns);\n }\n\n let res: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AIElementLocateResponse>>\n >;\n try {\n res = await callAIWithObjectResponse<AIElementLocateResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'locate',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElement: LocateResultElement | undefined;\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(res.content, resultAdapter.promptSpec.resultKey)) {\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n }\n\n try {\n const mapping = options.searchConfig?.mapping;\n const targetPixelBbox = resultAdapter.adaptElementLocateResultToPixelBbox(\n res.content,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n );\n resRect = pixelBboxToRect(\n mapSearchAreaPixelBboxToOriginalPixelBbox(targetPixelBbox, mapping),\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n elementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElement = element;\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse locate result: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n element: matchedElement,\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelRuntime: ModelRuntime;\n abortSignal?: AbortSignal;\n}): Promise<{\n searchAreaConfig?: SearchAreaConfig;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'section locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n policy: adapter.imagePreprocess,\n });\n\n const systemPrompt = systemPromptToLocateSection(\n adapter.locate.resultAdapter.promptSpec,\n );\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n userPromptToString(sectionDescription),\n );\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: preparedImage.imageBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await multimodalPromptToChatMessages(\n userPromptToMultimodalPrompt(sectionDescription),\n );\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'section-locator',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n searchAreaConfig: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let searchAreaConfig:\n | Awaited<ReturnType<typeof buildSearchAreaConfig>>\n | undefined;\n let sectionError = result.content.error;\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(result.content, resultAdapter.promptSpec.resultKey)) {\n return {\n searchAreaConfig: undefined,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n }\n\n try {\n const adaptedResult =\n resultAdapter.adaptSectionLocateResultToPixelBboxGroup(result.content, {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n });\n const mergedRect = mergePixelBboxesToRect([\n adaptedResult.target,\n ...(adaptedResult.references ?? []),\n ]);\n debugSection('mergedRect %j', mergedRect);\n\n const expandedRect = expandSearchArea(mergedRect, context.shotSize);\n const originalWidth = expandedRect.width;\n const originalHeight = expandedRect.height;\n debugSection('expanded sectionRect %j', expandedRect);\n\n searchAreaConfig = await buildSearchAreaConfig({\n context,\n baseRect: mergedRect,\n });\n\n debugSection(\n 'scaled section image from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n searchAreaConfig.image.width,\n searchAreaConfig.image.height,\n searchAreaConfig.mapping.scale,\n );\n } catch (error) {\n const parseErrorMessage =\n error instanceof Error\n ? `Failed to parse section locate result: ${error.message}`\n : 'unknown error in section locate';\n sectionError = sectionError\n ? `${sectionError} (${parseErrorMessage})`\n : parseErrorMessage;\n }\n\n return {\n searchAreaConfig,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelRuntime: ModelRuntime;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelRuntime } =\n options;\n const systemPrompt = systemPromptToExtract({\n screenshotIncluded: extractOption?.screenshotIncluded !== false,\n referenceImagesIncluded: !!multimodalPrompt?.images?.length,\n });\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'text',\n text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',\n });\n\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await multimodalPromptToChatMessages(multimodalPrompt);\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelRuntime);\n\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n modelRuntime: ModelRuntime,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n debugInspect('AiJudgeOrderSensitive: description=%s', description);\n\n const result = await callAIWithObjectResponse<{ isOrderSensitive: boolean }>(\n msgs,\n modelRuntime,\n {\n jsonParserSource: 'generic-object',\n },\n );\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","hasLocateResult","input","resultKey","record","locateResult","Array","undefined","buildSearchAreaConfig","options","context","baseRect","scaleRatio","sectionRect","expandSearchArea","croppedResult","cropByRect","scaledResult","scaleImage","AiLocateElement","targetElementDescription","locateOptions","locateAdapter","genericLocate","elementDescription","modelRuntime","adapter","assert","screenshotBase64","elementDescriptionText","userPromptToString","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","modelImage","preparedImage","prepareModelImage","imagePayload","msgs","addOns","multimodalPromptToChatMessages","userPromptToMultimodalPrompt","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","usage","JSON","resRect","matchedElement","errors","resultAdapter","mapping","targetPixelBbox","pixelBboxToRect","mapSearchAreaPixelBboxToOriginalPixelBbox","element","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","searchAreaConfig","sectionError","adaptedResult","mergedRect","mergePixelBboxesToRect","expandedRect","originalWidth","originalHeight","error","parseErrorMessage","AiExtractElementInfo","dataQuery","extractOption","multimodalPrompt","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AAiEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAO9B,SAASE,gBAAgBC,KAAc,EAAEC,SAAiB;IACxD,IAAI,CAACD,SAAS,AAAiB,YAAjB,OAAOA,OACnB,OAAO;IAGT,MAAME,SAASF;IACf,MAAMG,eAAeD,MAAM,CAACD,UAAU;IACtC,OAAOG,MAAM,OAAO,CAACD,gBACjBA,aAAa,MAAM,GAAG,IACtBA,AAAiBE,WAAjBF;AACN;AAEO,eAAeG,sBAAsBC,OAG3C;IACC,MAAM,EAAEC,OAAO,EAAEC,QAAQ,EAAE,GAAGF;IAC9B,MAAMG,aAAa;IACnB,MAAMC,cAAcC,iBAAiBH,UAAUD,QAAQ,QAAQ;IAE/D,MAAMK,gBAAgB,MAAMC,WAC1BN,QAAQ,UAAU,CAAC,MAAM,EACzBG;IAGF,MAAMI,eAAe,MAAMC,WAAWH,cAAc,WAAW,EAAEH;IACjE,OAAO;QACL,YAAYC;QACZ,OAAO;YACL,aAAaI,aAAa,WAAW;YACrC,OAAOA,aAAa,KAAK;YACzB,QAAQA,aAAa,MAAM;QAC7B;QACA,SAAS;YACP,QAAQ;gBACN,GAAGJ,YAAY,IAAI;gBACnB,GAAGA,YAAY,GAAG;YACpB;YACA,OAAOD;QACT;IACF;AACF;AAEO,eAAeO,gBACpBV,OAAkE;IAElE,MAAM,EAAEW,wBAAwB,EAAE,GAAGC,eAAe,GAAGZ;IACvD,MAAMa,gBAAgBb,QAAQ,YAAY,CAAC,OAAO,CAAC,MAAM;IACzD,IAAIa,AAAuB,aAAvBA,cAAc,IAAI,EACpB,OAAOA,cAAc,QAAQ,CAACF,0BAA0BC;IAE1D,OAAOE,cAAcH,0BAA0BC;AACjD;AAEO,eAAeE,cACpBC,kBAA+B,EAC/Bf,OAAsB;IAEtB,MAAM,EAAEC,OAAO,EAAE,GAAGD;IACpB,MAAMgB,eAAehB,QAAQ,YAAY;IACzC,MAAM,EAAEiB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAElDiB,OAAOH,oBAAoB;IAC3B,MAAMK,yBAAyBC,mBAAmBN;IAClD,MAAMO,wBAAwBC,kBAAkBH;IAChD,MAAMI,eAAeC,4BACnBR,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAGzC,MAAMS,aAAa1B,QAAQ,YAAY,EAAE,SAAS;QAChD,aAAamB;QACb,OAAOlB,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;IACjC;IACA,MAAM0B,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaF,WAAW,WAAW;QACnC,OAAOA,WAAW,KAAK;QACvB,QAAQA,WAAW,MAAM;QACzB,QAAQT,QAAQ,eAAe;IACjC;IAEA,MAAMY,eAAeF,cAAc,WAAW;IAE9C,MAAMG,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKK;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMP;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOP,oBAAiC;QAC1C,MAAMgB,SAAS,MAAMC,+BACnBC,6BAA6BlB;QAE/Be,KAAK,IAAI,IAAIC;IACf;IAEA,IAAIG;IAGJ,IAAI;QACFA,MAAM,MAAMC,yBACVL,MACAd,cACA;YACE,aAAahB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOoC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGtC;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,SAASA;gBACT,QAAQ;oBAAC,CAAC,eAAe,EAAEuC,cAAc;iBAAC;YAC5C;YACAG;YACAE;YACA,mBAAmB5C;QACrB;IACF;IAEA,MAAM0C,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIU;IACJ,IAAIC;IACJ,IAAIC,SACF,YAAYZ,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,MAAMa,gBAAgB9B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACzB,gBAAgB0C,IAAI,OAAO,EAAEa,cAAc,UAAU,CAAC,SAAS,GAClE,OAAO;QACL,MAAMjD;QACN,aAAa;YACX,SAASA;YACT,QAAQgD;QACV;QACAN;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;IAGF,IAAI;QACF,MAAMc,UAAUhD,QAAQ,YAAY,EAAE;QACtC,MAAMiD,kBAAkBF,cAAc,mCAAmC,CACvEb,IAAI,OAAO,EACX;YACE,cAAcP,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QAEFiB,UAAUM,gBACRC,0CAA0CF,iBAAiBD;QAG7D3D,aAAa,WAAWuD;QAExB,MAAMQ,UAA+BC,sBACnCT,SACAxB;QAEF0B,SAAS,EAAE;QAEX,IAAIM,SACFP,iBAAiBO;IAErB,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAahB,QACT,CAAC,+BAA+B,EAAEgB,EAAE,OAAO,EAAE,GAC7C;QACN,IAAI,AAACR,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAES,IAAI,CAAC,CAAC;aAFtBT,SAAS;YAACS;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMX;QACN,aAAa;YACX,SAASC;YACT,QAAQC;QACV;QACAN;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAesB,gBAAgBxD,OAKrC;IAMC,MAAM,EAAEC,OAAO,EAAEwD,kBAAkB,EAAE,GAAGzD;IACxC,MAAMgB,eAAehB,QAAQ,YAAY;IACzC,MAAM,EAAEiB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAClD,MAAM0B,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaT;QACb,OAAOlB,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;QAC/B,QAAQgB,QAAQ,eAAe;IACjC;IAEA,MAAMO,eAAekC,4BACnBzC,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAEzC,MAAM0C,gCAAgCC,0BACpCvC,mBAAmBoC;IAErB,MAAM3B,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG,cAAc,WAAW;wBAC9B,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMgC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM1B,SAAS,MAAMC,+BACnBC,6BAA6BwB;QAE/B3B,KAAK,IAAI,IAAIC;IACf;IAEA,IAAI8B;IAGJ,IAAI;QACFA,SAAS,MAAM1B,yBACbL,MACAd,cACA;YACE,aAAahB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOoC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGtC;QAChE,OAAO;YACL,kBAAkBA;YAClB,OAAO,CAAC,eAAe,EAAEuC,cAAc;YACvCG;YACAE;QACF;IACF;IAEA,IAAIoB;IAGJ,IAAIC,eAAeF,OAAO,OAAO,CAAC,KAAK;IACvC,MAAMd,gBAAgB9B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACzB,gBAAgBqE,OAAO,OAAO,EAAEd,cAAc,UAAU,CAAC,SAAS,GACrE,OAAO;QACL,kBAAkBjD;QAClB,OAAOiE;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;IAGF,IAAI;QACF,MAAMG,gBACJjB,cAAc,wCAAwC,CAACc,OAAO,OAAO,EAAE;YACrE,cAAclC,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QACF,MAAMsC,aAAaC,uBAAuB;YACxCF,cAAc,MAAM;eAChBA,cAAc,UAAU,IAAI,EAAE;SACnC;QACDzE,aAAa,iBAAiB0E;QAE9B,MAAME,eAAe9D,iBAAiB4D,YAAYhE,QAAQ,QAAQ;QAClE,MAAMmE,gBAAgBD,aAAa,KAAK;QACxC,MAAME,iBAAiBF,aAAa,MAAM;QAC1C5E,aAAa,2BAA2B4E;QAExCL,mBAAmB,MAAM/D,sBAAsB;YAC7CE;YACA,UAAUgE;QACZ;QAEA1E,aACE,uDACA6E,eACAC,gBACAP,iBAAiB,KAAK,CAAC,KAAK,EAC5BA,iBAAiB,KAAK,CAAC,MAAM,EAC7BA,iBAAiB,OAAO,CAAC,KAAK;IAElC,EAAE,OAAOQ,OAAO;QACd,MAAMC,oBACJD,iBAAiBhC,QACb,CAAC,uCAAuC,EAAEgC,MAAM,OAAO,EAAE,GACzD;QACNP,eAAeA,eACX,GAAGA,aAAa,EAAE,EAAEQ,kBAAkB,CAAC,CAAC,GACxCA;IACN;IAEA,OAAO;QACLT;QACA,OAAOC;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeW,qBAAwBxE,OAO7C;IACC,MAAM,EAAEyE,SAAS,EAAExE,OAAO,EAAEyE,aAAa,EAAEC,gBAAgB,EAAE3D,YAAY,EAAE,GACzEhB;IACF,MAAMwB,eAAeoD,sBAAsB;QACzC,oBAAoBF,eAAe,uBAAuB;QAC1D,yBAAyB,CAAC,CAACC,kBAAkB,QAAQ;IACvD;IACA,MAAMxD,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM4E,wBAAwBC,uBAC5B9E,QAAQ,eAAe,IAAI,IAC3ByE;IAGF,MAAMM,cAAyD,EAAE;IAEjE,IAAIL,eAAe,uBAAuB,OAAO;QAC/CK,YAAY,IAAI,CAAC;YACf,MAAM;YACN,MAAM;QACR;QAEAA,YAAY,IAAI,CAAC;YACf,MAAM;YACN,WAAW;gBACT,KAAK5D;gBACL,QAAQ;YACV;QACF;IACF;IAEA4D,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM/C,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAASuD;QACX;KACD;IAED,IAAIJ,kBAAkB;QACpB,MAAM5C,SAAS,MAAMC,+BAA+B2C;QACpD7C,KAAK,IAAI,IAAIC;IACf;IAEA,MAAM,EACJ,SAASS,WAAW,EACpBE,KAAK,EACLsC,iBAAiB,EAClB,GAAG,MAAMC,OAAOnD,MAAMd;IAEvB,IAAIkE;IACJ,IAAI;QACFA,cAAcC,2BAA8B3C;IAC9C,EAAE,OAAO4C,YAAY;QACnB,MAAM/C,eACJ+C,sBAAsB9C,QAAQ8C,WAAW,OAAO,GAAG7C,OAAO6C;QAC5D,MAAM,IAAI3C,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAE;IAEJ;IAEA,OAAO;QACLwC;QACA1C;QACAE;QACAsC;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBtE,YAA0B;IAK1B,MAAMQ,eAAe+D;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMxD,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAASgE;QACX;KACD;IAEDnG,aAAa,yCAAyCiG;IAEtD,MAAMzB,SAAS,MAAM1B,yBACnBL,MACAd,cACA;QACE,kBAAkB;IACpB;IAGF,OAAO;QACL,kBAAkB6C,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocateResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport { cropByRect, scaleImage } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport {\n expandSearchArea,\n multimodalPromptToChatMessages,\n userPromptToMultimodalPrompt,\n userPromptToString,\n} from '../common';\nimport type { ModelRuntime } from './models';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n} from './service-caller/index';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport {\n mergePixelBboxesToRect,\n pixelBboxToRect,\n} from './workflows/inspect/locate-result-rect';\nimport { mapSearchAreaPixelBboxToOriginalPixelBbox } from './workflows/inspect/search-area-mapping';\nimport type {\n LocateOptions,\n LocateResult,\n SearchAreaConfig,\n} from './workflows/inspect/types';\n\nexport type InspectAIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nexport {\n userPromptToString as extraTextFromUserPrompt,\n multimodalPromptToChatMessages as promptsToChatParam,\n} from '../common';\n\nfunction hasLocateResult(input: unknown, resultKey: string) {\n if (!input || typeof input !== 'object') {\n return false;\n }\n\n const record = input as Record<string, unknown>;\n const locateResult = record[resultKey];\n return Array.isArray(locateResult)\n ? locateResult.length > 0\n : locateResult !== undefined;\n}\n\nexport async function buildSearchAreaConfig(options: {\n context: UIContext;\n baseRect: Rect;\n}): Promise<SearchAreaConfig> {\n const { context, baseRect } = options;\n const scaleRatio = 2;\n const sectionRect = expandSearchArea(baseRect, context.shotSize);\n\n const croppedResult = await cropByRect(\n context.screenshot.base64,\n sectionRect,\n );\n\n const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);\n return {\n sourceRect: sectionRect,\n image: {\n imageBase64: scaledResult.imageBase64,\n width: scaledResult.width,\n height: scaledResult.height,\n },\n mapping: {\n offset: {\n x: sectionRect.left,\n y: sectionRect.top,\n },\n scale: scaleRatio,\n },\n };\n}\n\nexport async function AiLocateElement(\n options: LocateOptions & { targetElementDescription: TUserPrompt },\n): Promise<LocateResult> {\n const { targetElementDescription, ...locateOptions } = options;\n const locateAdapter = options.modelRuntime.adapter.locate;\n if (locateAdapter.kind === 'custom') {\n return locateAdapter.locateFn(targetElementDescription, locateOptions);\n }\n return genericLocate(targetElementDescription, locateOptions);\n}\n\nexport async function genericLocate(\n elementDescription: TUserPrompt,\n options: LocateOptions,\n): Promise<LocateResult> {\n const { context } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'generic locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n\n assert(elementDescription, 'cannot find the target element description');\n const elementDescriptionText = userPromptToString(elementDescription);\n const userInstructionPrompt = findElementPrompt(elementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(\n adapter.locate.resultAdapter.promptSpec,\n );\n\n const modelImage = options.searchConfig?.image ?? {\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n };\n const preparedImage = await prepareModelImage({\n imageBase64: modelImage.imageBase64,\n width: modelImage.width,\n height: modelImage.height,\n policy: adapter.imagePreprocess,\n });\n\n const imagePayload = preparedImage.imageBase64;\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof elementDescription !== 'string') {\n const addOns = await multimodalPromptToChatMessages(\n userPromptToMultimodalPrompt(elementDescription),\n );\n msgs.push(...addOns);\n }\n\n let res: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AIElementLocateResponse>>\n >;\n try {\n res = await callAIWithObjectResponse<AIElementLocateResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'locate',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n const rawChoiceMessage =\n callError instanceof AIResponseParseError\n ? callError.rawChoiceMessage\n : undefined;\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n rawChoiceMessage,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElement: LocateResultElement | undefined;\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(res.content, resultAdapter.promptSpec.resultKey)) {\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: errors as string[],\n },\n rawResponse,\n rawChoiceMessage: res.rawChoiceMessage,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n }\n\n try {\n const mapping = options.searchConfig?.mapping;\n const targetPixelBbox = resultAdapter.adaptElementLocateResultToPixelBbox(\n res.content,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n );\n resRect = pixelBboxToRect(\n mapSearchAreaPixelBboxToOriginalPixelBbox(targetPixelBbox, mapping),\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n elementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElement = element;\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse locate result: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n element: matchedElement,\n errors: errors as string[],\n },\n rawResponse,\n rawChoiceMessage: res.rawChoiceMessage,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelRuntime: ModelRuntime;\n abortSignal?: AbortSignal;\n}): Promise<{\n searchAreaConfig?: SearchAreaConfig;\n error?: string;\n rawResponse: string;\n rawChoiceMessage?: unknown;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'section locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n policy: adapter.imagePreprocess,\n });\n\n const systemPrompt = systemPromptToLocateSection(\n adapter.locate.resultAdapter.promptSpec,\n );\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n userPromptToString(sectionDescription),\n );\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: preparedImage.imageBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await multimodalPromptToChatMessages(\n userPromptToMultimodalPrompt(sectionDescription),\n );\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'section-locator',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n const rawChoiceMessage =\n callError instanceof AIResponseParseError\n ? callError.rawChoiceMessage\n : undefined;\n return {\n searchAreaConfig: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n rawChoiceMessage,\n usage,\n };\n }\n\n let searchAreaConfig:\n | Awaited<ReturnType<typeof buildSearchAreaConfig>>\n | undefined;\n let sectionError = result.content.error;\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(result.content, resultAdapter.promptSpec.resultKey)) {\n return {\n searchAreaConfig: undefined,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n rawChoiceMessage: result.rawChoiceMessage,\n usage: result.usage,\n };\n }\n\n try {\n const adaptedResult =\n resultAdapter.adaptSectionLocateResultToPixelBboxGroup(result.content, {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n });\n const mergedRect = mergePixelBboxesToRect([\n adaptedResult.target,\n ...(adaptedResult.references ?? []),\n ]);\n debugSection('mergedRect %j', mergedRect);\n\n const expandedRect = expandSearchArea(mergedRect, context.shotSize);\n const originalWidth = expandedRect.width;\n const originalHeight = expandedRect.height;\n debugSection('expanded sectionRect %j', expandedRect);\n\n searchAreaConfig = await buildSearchAreaConfig({\n context,\n baseRect: mergedRect,\n });\n\n debugSection(\n 'scaled section image from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n searchAreaConfig.image.width,\n searchAreaConfig.image.height,\n searchAreaConfig.mapping.scale,\n );\n } catch (error) {\n const parseErrorMessage =\n error instanceof Error\n ? `Failed to parse section locate result: ${error.message}`\n : 'unknown error in section locate';\n sectionError = sectionError\n ? `${sectionError} (${parseErrorMessage})`\n : parseErrorMessage;\n }\n\n return {\n searchAreaConfig,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n rawChoiceMessage: result.rawChoiceMessage,\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelRuntime: ModelRuntime;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelRuntime } =\n options;\n const systemPrompt = systemPromptToExtract({\n screenshotIncluded: extractOption?.screenshotIncluded !== false,\n referenceImagesIncluded: !!multimodalPrompt?.images?.length,\n });\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'text',\n text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',\n });\n\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await multimodalPromptToChatMessages(multimodalPrompt);\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n rawChoiceMessage,\n } = await callAI(msgs, modelRuntime);\n\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n rawChoiceMessage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n rawChoiceMessage,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n modelRuntime: ModelRuntime,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n debugInspect('AiJudgeOrderSensitive: description=%s', description);\n\n const result = await callAIWithObjectResponse<{ isOrderSensitive: boolean }>(\n msgs,\n modelRuntime,\n {\n jsonParserSource: 'generic-object',\n },\n );\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","hasLocateResult","input","resultKey","record","locateResult","Array","undefined","buildSearchAreaConfig","options","context","baseRect","scaleRatio","sectionRect","expandSearchArea","croppedResult","cropByRect","scaledResult","scaleImage","AiLocateElement","targetElementDescription","locateOptions","locateAdapter","genericLocate","elementDescription","modelRuntime","adapter","assert","screenshotBase64","elementDescriptionText","userPromptToString","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","modelImage","preparedImage","prepareModelImage","imagePayload","msgs","addOns","multimodalPromptToChatMessages","userPromptToMultimodalPrompt","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","usage","rawChoiceMessage","JSON","resRect","matchedElement","errors","resultAdapter","mapping","targetPixelBbox","pixelBboxToRect","mapSearchAreaPixelBboxToOriginalPixelBbox","element","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","searchAreaConfig","sectionError","adaptedResult","mergedRect","mergePixelBboxesToRect","expandedRect","originalWidth","originalHeight","error","parseErrorMessage","AiExtractElementInfo","dataQuery","extractOption","multimodalPrompt","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AAiEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAO9B,SAASE,gBAAgBC,KAAc,EAAEC,SAAiB;IACxD,IAAI,CAACD,SAAS,AAAiB,YAAjB,OAAOA,OACnB,OAAO;IAGT,MAAME,SAASF;IACf,MAAMG,eAAeD,MAAM,CAACD,UAAU;IACtC,OAAOG,MAAM,OAAO,CAACD,gBACjBA,aAAa,MAAM,GAAG,IACtBA,AAAiBE,WAAjBF;AACN;AAEO,eAAeG,sBAAsBC,OAG3C;IACC,MAAM,EAAEC,OAAO,EAAEC,QAAQ,EAAE,GAAGF;IAC9B,MAAMG,aAAa;IACnB,MAAMC,cAAcC,iBAAiBH,UAAUD,QAAQ,QAAQ;IAE/D,MAAMK,gBAAgB,MAAMC,WAC1BN,QAAQ,UAAU,CAAC,MAAM,EACzBG;IAGF,MAAMI,eAAe,MAAMC,WAAWH,cAAc,WAAW,EAAEH;IACjE,OAAO;QACL,YAAYC;QACZ,OAAO;YACL,aAAaI,aAAa,WAAW;YACrC,OAAOA,aAAa,KAAK;YACzB,QAAQA,aAAa,MAAM;QAC7B;QACA,SAAS;YACP,QAAQ;gBACN,GAAGJ,YAAY,IAAI;gBACnB,GAAGA,YAAY,GAAG;YACpB;YACA,OAAOD;QACT;IACF;AACF;AAEO,eAAeO,gBACpBV,OAAkE;IAElE,MAAM,EAAEW,wBAAwB,EAAE,GAAGC,eAAe,GAAGZ;IACvD,MAAMa,gBAAgBb,QAAQ,YAAY,CAAC,OAAO,CAAC,MAAM;IACzD,IAAIa,AAAuB,aAAvBA,cAAc,IAAI,EACpB,OAAOA,cAAc,QAAQ,CAACF,0BAA0BC;IAE1D,OAAOE,cAAcH,0BAA0BC;AACjD;AAEO,eAAeE,cACpBC,kBAA+B,EAC/Bf,OAAsB;IAEtB,MAAM,EAAEC,OAAO,EAAE,GAAGD;IACpB,MAAMgB,eAAehB,QAAQ,YAAY;IACzC,MAAM,EAAEiB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAElDiB,OAAOH,oBAAoB;IAC3B,MAAMK,yBAAyBC,mBAAmBN;IAClD,MAAMO,wBAAwBC,kBAAkBH;IAChD,MAAMI,eAAeC,4BACnBR,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAGzC,MAAMS,aAAa1B,QAAQ,YAAY,EAAE,SAAS;QAChD,aAAamB;QACb,OAAOlB,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;IACjC;IACA,MAAM0B,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaF,WAAW,WAAW;QACnC,OAAOA,WAAW,KAAK;QACvB,QAAQA,WAAW,MAAM;QACzB,QAAQT,QAAQ,eAAe;IACjC;IAEA,MAAMY,eAAeF,cAAc,WAAW;IAE9C,MAAMG,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKK;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMP;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOP,oBAAiC;QAC1C,MAAMgB,SAAS,MAAMC,+BACnBC,6BAA6BlB;QAE/Be,KAAK,IAAI,IAAIC;IACf;IAEA,IAAIG;IAGJ,IAAI;QACFA,MAAM,MAAMC,yBACVL,MACAd,cACA;YACE,aAAahB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOoC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGtC;QAChE,MAAM6C,mBACJP,qBAAqBK,uBACjBL,UAAU,gBAAgB,GAC1BtC;QACN,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,SAASA;gBACT,QAAQ;oBAAC,CAAC,eAAe,EAAEuC,cAAc;iBAAC;YAC5C;YACAG;YACAG;YACAD;YACA,mBAAmB5C;QACrB;IACF;IAEA,MAAM0C,cAAcI,KAAK,SAAS,CAACV,IAAI,OAAO;IAE9C,IAAIW;IACJ,IAAIC;IACJ,IAAIC,SACF,YAAYb,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,MAAMc,gBAAgB/B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACzB,gBAAgB0C,IAAI,OAAO,EAAEc,cAAc,UAAU,CAAC,SAAS,GAClE,OAAO;QACL,MAAMlD;QACN,aAAa;YACX,SAASA;YACT,QAAQiD;QACV;QACAP;QACA,kBAAkBN,IAAI,gBAAgB;QACtC,OAAOA,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;IAGF,IAAI;QACF,MAAMe,UAAUjD,QAAQ,YAAY,EAAE;QACtC,MAAMkD,kBAAkBF,cAAc,mCAAmC,CACvEd,IAAI,OAAO,EACX;YACE,cAAcP,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QAEFkB,UAAUM,gBACRC,0CAA0CF,iBAAiBD;QAG7D5D,aAAa,WAAWwD;QAExB,MAAMQ,UAA+BC,sBACnCT,SACAzB;QAEF2B,SAAS,EAAE;QAEX,IAAIM,SACFP,iBAAiBO;IAErB,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAajB,QACT,CAAC,+BAA+B,EAAEiB,EAAE,OAAO,EAAE,GAC7C;QACN,IAAI,AAACR,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAES,IAAI,CAAC,CAAC;aAFtBT,SAAS;YAACS;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMX;QACN,aAAa;YACX,SAASC;YACT,QAAQC;QACV;QACAP;QACA,kBAAkBN,IAAI,gBAAgB;QACtC,OAAOA,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAeuB,gBAAgBzD,OAKrC;IAOC,MAAM,EAAEC,OAAO,EAAEyD,kBAAkB,EAAE,GAAG1D;IACxC,MAAMgB,eAAehB,QAAQ,YAAY;IACzC,MAAM,EAAEiB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAClD,MAAM0B,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaT;QACb,OAAOlB,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;QAC/B,QAAQgB,QAAQ,eAAe;IACjC;IAEA,MAAMO,eAAemC,4BACnB1C,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAEzC,MAAM2C,gCAAgCC,0BACpCxC,mBAAmBqC;IAErB,MAAM5B,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG,cAAc,WAAW;wBAC9B,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMiC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAMC,+BACnBC,6BAA6ByB;QAE/B5B,KAAK,IAAI,IAAIC;IACf;IAEA,IAAI+B;IAGJ,IAAI;QACFA,SAAS,MAAM3B,yBACbL,MACAd,cACA;YACE,aAAahB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOoC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGtC;QAChE,MAAM6C,mBACJP,qBAAqBK,uBACjBL,UAAU,gBAAgB,GAC1BtC;QACN,OAAO;YACL,kBAAkBA;YAClB,OAAO,CAAC,eAAe,EAAEuC,cAAc;YACvCG;YACAG;YACAD;QACF;IACF;IAEA,IAAIqB;IAGJ,IAAIC,eAAeF,OAAO,OAAO,CAAC,KAAK;IACvC,MAAMd,gBAAgB/B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACzB,gBAAgBsE,OAAO,OAAO,EAAEd,cAAc,UAAU,CAAC,SAAS,GACrE,OAAO;QACL,kBAAkBlD;QAClB,OAAOkE;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,kBAAkBA,OAAO,gBAAgB;QACzC,OAAOA,OAAO,KAAK;IACrB;IAGF,IAAI;QACF,MAAMG,gBACJjB,cAAc,wCAAwC,CAACc,OAAO,OAAO,EAAE;YACrE,cAAcnC,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QACF,MAAMuC,aAAaC,uBAAuB;YACxCF,cAAc,MAAM;eAChBA,cAAc,UAAU,IAAI,EAAE;SACnC;QACD1E,aAAa,iBAAiB2E;QAE9B,MAAME,eAAe/D,iBAAiB6D,YAAYjE,QAAQ,QAAQ;QAClE,MAAMoE,gBAAgBD,aAAa,KAAK;QACxC,MAAME,iBAAiBF,aAAa,MAAM;QAC1C7E,aAAa,2BAA2B6E;QAExCL,mBAAmB,MAAMhE,sBAAsB;YAC7CE;YACA,UAAUiE;QACZ;QAEA3E,aACE,uDACA8E,eACAC,gBACAP,iBAAiB,KAAK,CAAC,KAAK,EAC5BA,iBAAiB,KAAK,CAAC,MAAM,EAC7BA,iBAAiB,OAAO,CAAC,KAAK;IAElC,EAAE,OAAOQ,OAAO;QACd,MAAMC,oBACJD,iBAAiBjC,QACb,CAAC,uCAAuC,EAAEiC,MAAM,OAAO,EAAE,GACzD;QACNP,eAAeA,eACX,GAAGA,aAAa,EAAE,EAAEQ,kBAAkB,CAAC,CAAC,GACxCA;IACN;IAEA,OAAO;QACLT;QACA,OAAOC;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,kBAAkBA,OAAO,gBAAgB;QACzC,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeW,qBAAwBzE,OAO7C;IACC,MAAM,EAAE0E,SAAS,EAAEzE,OAAO,EAAE0E,aAAa,EAAEC,gBAAgB,EAAE5D,YAAY,EAAE,GACzEhB;IACF,MAAMwB,eAAeqD,sBAAsB;QACzC,oBAAoBF,eAAe,uBAAuB;QAC1D,yBAAyB,CAAC,CAACC,kBAAkB,QAAQ;IACvD;IACA,MAAMzD,mBAAmBlB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM6E,wBAAwBC,uBAC5B/E,QAAQ,eAAe,IAAI,IAC3B0E;IAGF,MAAMM,cAAyD,EAAE;IAEjE,IAAIL,eAAe,uBAAuB,OAAO;QAC/CK,YAAY,IAAI,CAAC;YACf,MAAM;YACN,MAAM;QACR;QAEAA,YAAY,IAAI,CAAC;YACf,MAAM;YACN,WAAW;gBACT,KAAK7D;gBACL,QAAQ;YACV;QACF;IACF;IAEA6D,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMhD,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAASwD;QACX;KACD;IAED,IAAIJ,kBAAkB;QACpB,MAAM7C,SAAS,MAAMC,+BAA+B4C;QACpD9C,KAAK,IAAI,IAAIC;IACf;IAEA,MAAM,EACJ,SAASS,WAAW,EACpBE,KAAK,EACLuC,iBAAiB,EACjBtC,gBAAgB,EACjB,GAAG,MAAMuC,OAAOpD,MAAMd;IAEvB,IAAImE;IACJ,IAAI;QACFA,cAAcC,2BAA8B5C;IAC9C,EAAE,OAAO6C,YAAY;QACnB,MAAMhD,eACJgD,sBAAsB/C,QAAQ+C,WAAW,OAAO,GAAG9C,OAAO8C;QAC5D,MAAM,IAAI5C,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAE,OACAC;IAEJ;IAEA,OAAO;QACLwC;QACA3C;QACAG;QACAD;QACAuC;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBvE,YAA0B;IAK1B,MAAMQ,eAAegE;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAMzD,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASN;QAAa;QACxC;YACE,MAAM;YACN,SAASiE;QACX;KACD;IAEDpG,aAAa,yCAAyCkG;IAEtD,MAAMzB,SAAS,MAAM3B,yBACnBL,MACAd,cACA;QACE,kBAAkB;IACpB;IAGF,OAAO;QACL,kBAAkB8C,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -165,7 +165,7 @@ async function plan(userInstruction, opts) {
165
165
  ...instruction,
166
166
  ...historyLog
167
167
  ];
168
- let { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelRuntime, {
168
+ let { content: rawResponse, usage, reasoning_content, rawChoiceMessage } = await callAI(msgs, modelRuntime, {
169
169
  abortSignal: opts.abortSignal,
170
170
  requiresOriginalImageDetail: opts.includeLocateInPlanning
171
171
  });
@@ -181,6 +181,7 @@ async function plan(userInstruction, opts) {
181
181
  rawResponse = retry.content;
182
182
  usage = retry.usage;
183
183
  reasoning_content = retry.reasoning_content;
184
+ rawChoiceMessage = retry.rawChoiceMessage;
184
185
  planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);
185
186
  }
186
187
  if (planFromAI.action && void 0 !== planFromAI.finalizeSuccess) {
@@ -201,6 +202,7 @@ async function plan(userInstruction, opts) {
201
202
  ...planFromAI,
202
203
  actions,
203
204
  rawResponse,
205
+ rawChoiceMessage,
204
206
  usage,
205
207
  reasoning_content,
206
208
  yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),
@@ -251,7 +253,7 @@ async function plan(userInstruction, opts) {
251
253
  return returnValue;
252
254
  } catch (parseError) {
253
255
  const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
254
- throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
256
+ throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage, rawChoiceMessage);
255
257
  }
256
258
  }
257
259
  export { parseXMLPlanningResponse, plan };
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import { type TUserPrompt, userPromptToString } from '@/common';\nimport type {\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n} from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { buildYamlFlowFromPlans, findAllMidsceneLocatorField } from '../common';\nimport { planningModelFamilyRequiredForLocateMessage } from './errors';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport { AIResponseParseError, callAI } from './service-caller/index';\nimport type { JsonParser, JsonParserSource } from './service-caller/json';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport type { PlanOptions } from './workflows/planning/types';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\nconst noPreviousActionsText =\n 'No previous actions have been executed in this aiAct execution yet. If the instruction asks for actions, choose the first action to execute.';\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse.\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n jsonParser: JsonParser,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n // Strip any trailing XML tags that LLM might have leaked into the action type\n // e.g. \"KeyboardPress</action-type>\\n<action-param-json>\" -> \"KeyboardPress\"\n const type = actionType.split('<')[0].trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = jsonParser(actionParamStr, {\n source: 'planning-action-param',\n preserveStringValueKeys:\n type.toLowerCase() === 'input' ? ['value'] : undefined,\n });\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: TUserPrompt,\n opts: PlanOptions,\n): Promise<PlanningAIResponse> {\n const { context, conversationHistory } = opts;\n const modelRuntime = opts.modelRuntime;\n const { adapter } = modelRuntime;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n if (opts.includeLocateInPlanning && !modelRuntime.config.modelFamily) {\n throw new Error(\n planningModelFamilyRequiredForLocateMessage(modelRuntime.config.slot),\n );\n }\n\n const locateResultAdapter =\n modelRuntime.config.modelFamily && adapter.locate.kind === 'standard'\n ? adapter.locate.resultAdapter\n : undefined;\n\n // Only enable sub-goals when aiAct is in deep-thinking planning mode.\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n locatePromptSpec: locateResultAdapter?.promptSpec,\n includeLocateInPlanning: opts.includeLocateInPlanning,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: shotSize.width,\n height: shotSize.height,\n policy: adapter.imagePreprocess,\n });\n const imagePayload = preparedImage.imageBase64;\n\n const userInstructionText = userPromptToString(userInstruction);\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const referenceImageMessages = opts.referenceImageMessages ?? [];\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstructionText}</user_instruction>`,\n },\n ],\n },\n ...referenceImageMessages,\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In planning deep-think mode: show full sub-goals with logs\n // Otherwise: show historical execution logs\n const executionProgressText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const executionProgressSection = executionProgressText\n ? `\\n\\n${executionProgressText}`\n : conversationHistory.pendingFeedbackMessage\n ? ''\n : `\\n\\n${noPreviousActionsText}`;\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `This is the current screenshot.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Planning with locate results is localization-sensitive. Adapters decide\n // whether this should request original image detail.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n } catch {\n const retry = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Keep retry requests consistent with the initial planning call.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed in planning deep-think mode.\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (!opts.includeLocateInPlanning) {\n if (typeof locateResult === 'object') {\n // In prompt-only planning mode, ignore any accidental coordinates from the model.\n action.param[field] = { prompt: locateResult.prompt };\n }\n return;\n }\n\n assert(\n locateResultAdapter,\n 'generic planning locate normalization requires a standard locate adapter',\n );\n action.param[field] = {\n ...locateResult,\n locatedPixelBbox: locateResultAdapter.adaptPlanningParamToPixelBbox(\n locateResult,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n ),\n };\n }\n });\n });\n\n // Update sub-goals in conversation history only in planning deep-think mode.\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // Without planning deep-think mode, accumulate logs as historical execution steps.\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","noPreviousActionsText","parseXMLPlanningResponse","xmlString","jsonParser","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","e","Error","plan","userInstruction","opts","context","conversationHistory","modelRuntime","adapter","shotSize","screenshotBase64","planningModelFamilyRequiredForLocateMessage","locateResultAdapter","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","preparedImage","prepareModelImage","imagePayload","userInstructionText","userPromptToString","actionContext","referenceImageMessages","instruction","latestFeedbackMessage","executionProgressText","executionProgressSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;;AAqBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAErD,MAAME,wBACJ;AAKK,SAASC,yBACdC,SAAiB,EACjBC,UAAsB;IAEtB,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QAGrD,MAAMc,OAAOd,WAAW,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI;QAC1C,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQrB,WAAWO,gBAAgB;gBACjC,QAAQ;gBACR,yBACEa,AAAuB,YAAvBA,KAAK,WAAW,KAAiB;oBAAC;iBAAQ,GAAGR;YACjD;QACF,EAAE,OAAOU,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFH,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeO,KACpBC,eAA4B,EAC5BC,IAAiB;IAEjB,MAAM,EAAEC,OAAO,EAAEC,mBAAmB,EAAE,GAAGF;IACzC,MAAMG,eAAeH,KAAK,YAAY;IACtC,MAAM,EAAEI,OAAO,EAAE,GAAGD;IACpB,MAAM,EAAEE,QAAQ,EAAE,GAAGJ;IACrB,MAAMK,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,IAAID,KAAK,uBAAuB,IAAI,CAACG,aAAa,MAAM,CAAC,WAAW,EAClE,MAAM,IAAIN,MACRU,4CAA4CJ,aAAa,MAAM,CAAC,IAAI;IAIxE,MAAMK,sBACJL,aAAa,MAAM,CAAC,WAAW,IAAIC,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,GAClDA,QAAQ,MAAM,CAAC,aAAa,GAC5BlB;IAGN,MAAMuB,kBAAkBT,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMU,eAAe,MAAMC,2BAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,kBAAkBQ,qBAAqB;QACvC,yBAAyBR,KAAK,uBAAuB;QACrD,gBAAgB;QAChBS;IACF;IAEA,MAAMG,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaP;QACb,OAAOD,SAAS,KAAK;QACrB,QAAQA,SAAS,MAAM;QACvB,QAAQD,QAAQ,eAAe;IACjC;IACA,MAAMU,eAAeF,cAAc,WAAW;IAE9C,MAAMG,sBAAsBC,mBAAmBjB;IAC/C,MAAMkB,gBAAgBjB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMkB,yBAAyBlB,KAAK,sBAAsB,IAAI,EAAE;IAChE,MAAMmB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGF,cAAc,kBAAkB,EAAEF,oBAAoB,mBAAmB,CAAC;gBACrF;aACD;QACH;WACGG;KACJ;IAED,IAAIE;IAKJ,MAAMC,wBAAwBZ,kBAC1BP,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMoB,2BAA2BD,wBAC7B,CAAC,IAAI,EAAEA,uBAAuB,GAC9BnB,oBAAoB,sBAAsB,GACxC,KACA,CAAC,IAAI,EAAE/B,uBAAuB;IAGpC,MAAMoD,eAAerB,oBAAoB,cAAc;IACvD,MAAMsB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIrB,oBAAoB,sBAAsB,EAAE;QAC9CkB,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGlB,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEsB,kBAAkBF,0BAA0B;gBACzN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAZ,oBAAoB,mCAAmC;IACzD,OACEkB,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,+BAA+B,EAAEI,kBAAkBF,0BAA0B;YACtF;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKR;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFZ,oBAAoB,MAAM,CAACkB;IAG3BlB,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMuB,aAAavB,oBAAoB,QAAQ,CAACF,KAAK,kBAAkB;IAEvE,MAAM0B,OAAqC;QACzC;YAAE,MAAM;YAAU,SAAShB;QAAa;WACrCS;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMvB,cAAc;QACnC,aAAaH,KAAK,WAAW;QAG7B,6BAA6BA,KAAK,uBAAuB;IAC3D;IAGA,IAAI+B;IACJ,IAAI;QACF,IAAI;YACFA,aAAa3D,yBAAyBuD,aAAavB,QAAQ,UAAU;QACvE,EAAE,OAAM;YACN,MAAM4B,QAAQ,MAAMF,OAAOJ,MAAMvB,cAAc;gBAC7C,aAAaH,KAAK,WAAW;gBAE7B,6BAA6BA,KAAK,uBAAuB;YAC3D;YACA2B,cAAcK,MAAM,OAAO;YAC3BJ,QAAQI,MAAM,KAAK;YACnBH,oBAAoBG,MAAM,iBAAiB;YAC3CD,aAAa3D,yBAAyBuD,aAAavB,QAAQ,UAAU;QACvE;QAEA,IAAI2B,WAAW,MAAM,IAAIA,AAA+B7C,WAA/B6C,WAAW,eAAe,EAAgB;YACjE7D,QACE;YAEF6D,WAAW,eAAe,GAAG7C;YAC7B6C,WAAW,eAAe,GAAG7C;QAC/B;QAEA,MAAM+C,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B7C,WAA/B6C,WAAW,eAAe,EAAgB;YAC5C/D,MAAM;YACNkE,yBAAyB;YAEzB,IAAIzB,iBACFP,oBAAoB,uBAAuB;QAE/C;QAEA,MAAMiC,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAN;YACAC;YACAC;YACA,UAAUO,uBAAuBH,SAASjC,KAAK,WAAW;YAC1DkC;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACxC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM6C,sBAAsBtC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACP,SAAWA,OAAO,IAAI,KAAKC;YAG9B1B,MAAM,+BAA+BsE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENtE,MAAM,gBAAgBuE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAejD,OAAO,KAAK,CAACgD,MAAM;gBACxC,IAAIC,cAAc;oBAChB,IAAI,CAAC1C,KAAK,uBAAuB,EAAE;wBACjC,IAAI,AAAwB,YAAxB,OAAO0C,cAETjD,OAAO,KAAK,CAACgD,MAAM,GAAG;4BAAE,QAAQC,aAAa,MAAM;wBAAC;wBAEtD;oBACF;oBAEAL,OACE7B,qBACA;oBAEFf,OAAO,KAAK,CAACgD,MAAM,GAAG;wBACpB,GAAGC,YAAY;wBACf,kBAAkBlC,oBAAoB,6BAA6B,CACjEkC,cACA;4BACE,cAAc9B,cAAc,YAAY;4BACxC,aAAaA,cAAc,WAAW;wBACxC;oBAEJ;gBACF;YACF;QACF;QAGA,IAAIH,iBAAiB;YACnB,IAAIsB,WAAW,cAAc,EAAE,QAC7B7B,oBAAoB,aAAa,CAAC6B,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMY,SAASZ,WAAW,mBAAmB,CAChD7B,oBAAoB,mBAAmB,CAACyC;YAI5C,IAAIZ,WAAW,GAAG,EAChB7B,oBAAoB,gBAAgB,CAAC6B,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChB7B,oBAAoB,mBAAmB,CAAC6B,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnB7B,oBAAoB,YAAY,CAAC6B,WAAW,MAAM;QAGpD7B,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMyB;gBACR;aACD;QACH;QAEA,OAAOQ;IACT,EAAE,OAAOS,YAAY;QAEnB,MAAMC,eACJD,sBAAsB/C,QAAQ+C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClClB,aACAC;IAEJ;AACF"}
1
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import { type TUserPrompt, userPromptToString } from '@/common';\nimport type {\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n} from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { buildYamlFlowFromPlans, findAllMidsceneLocatorField } from '../common';\nimport { planningModelFamilyRequiredForLocateMessage } from './errors';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport { AIResponseParseError, callAI } from './service-caller/index';\nimport type { JsonParser, JsonParserSource } from './service-caller/json';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport type { PlanOptions } from './workflows/planning/types';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\nconst noPreviousActionsText =\n 'No previous actions have been executed in this aiAct execution yet. If the instruction asks for actions, choose the first action to execute.';\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse.\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n jsonParser: JsonParser,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n // Strip any trailing XML tags that LLM might have leaked into the action type\n // e.g. \"KeyboardPress</action-type>\\n<action-param-json>\" -> \"KeyboardPress\"\n const type = actionType.split('<')[0].trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = jsonParser(actionParamStr, {\n source: 'planning-action-param',\n preserveStringValueKeys:\n type.toLowerCase() === 'input' ? ['value'] : undefined,\n });\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: TUserPrompt,\n opts: PlanOptions,\n): Promise<PlanningAIResponse> {\n const { context, conversationHistory } = opts;\n const modelRuntime = opts.modelRuntime;\n const { adapter } = modelRuntime;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n if (opts.includeLocateInPlanning && !modelRuntime.config.modelFamily) {\n throw new Error(\n planningModelFamilyRequiredForLocateMessage(modelRuntime.config.slot),\n );\n }\n\n const locateResultAdapter =\n modelRuntime.config.modelFamily && adapter.locate.kind === 'standard'\n ? adapter.locate.resultAdapter\n : undefined;\n\n // Only enable sub-goals when aiAct is in deep-thinking planning mode.\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n locatePromptSpec: locateResultAdapter?.promptSpec,\n includeLocateInPlanning: opts.includeLocateInPlanning,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: shotSize.width,\n height: shotSize.height,\n policy: adapter.imagePreprocess,\n });\n const imagePayload = preparedImage.imageBase64;\n\n const userInstructionText = userPromptToString(userInstruction);\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const referenceImageMessages = opts.referenceImageMessages ?? [];\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstructionText}</user_instruction>`,\n },\n ],\n },\n ...referenceImageMessages,\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In planning deep-think mode: show full sub-goals with logs\n // Otherwise: show historical execution logs\n const executionProgressText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const executionProgressSection = executionProgressText\n ? `\\n\\n${executionProgressText}`\n : conversationHistory.pendingFeedbackMessage\n ? ''\n : `\\n\\n${noPreviousActionsText}`;\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `This is the current screenshot.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n rawChoiceMessage,\n } = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Planning with locate results is localization-sensitive. Adapters decide\n // whether this should request original image detail.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n } catch {\n const retry = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Keep retry requests consistent with the initial planning call.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n rawChoiceMessage = retry.rawChoiceMessage;\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed in planning deep-think mode.\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n rawChoiceMessage,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (!opts.includeLocateInPlanning) {\n if (typeof locateResult === 'object') {\n // In prompt-only planning mode, ignore any accidental coordinates from the model.\n action.param[field] = { prompt: locateResult.prompt };\n }\n return;\n }\n\n assert(\n locateResultAdapter,\n 'generic planning locate normalization requires a standard locate adapter',\n );\n action.param[field] = {\n ...locateResult,\n locatedPixelBbox: locateResultAdapter.adaptPlanningParamToPixelBbox(\n locateResult,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n ),\n };\n }\n });\n });\n\n // Update sub-goals in conversation history only in planning deep-think mode.\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // Without planning deep-think mode, accumulate logs as historical execution steps.\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n rawChoiceMessage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","noPreviousActionsText","parseXMLPlanningResponse","xmlString","jsonParser","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","e","Error","plan","userInstruction","opts","context","conversationHistory","modelRuntime","adapter","shotSize","screenshotBase64","planningModelFamilyRequiredForLocateMessage","locateResultAdapter","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","preparedImage","prepareModelImage","imagePayload","userInstructionText","userPromptToString","actionContext","referenceImageMessages","instruction","latestFeedbackMessage","executionProgressText","executionProgressSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","rawChoiceMessage","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;;AAqBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAErD,MAAME,wBACJ;AAKK,SAASC,yBACdC,SAAiB,EACjBC,UAAsB;IAEtB,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QAGrD,MAAMc,OAAOd,WAAW,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI;QAC1C,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQrB,WAAWO,gBAAgB;gBACjC,QAAQ;gBACR,yBACEa,AAAuB,YAAvBA,KAAK,WAAW,KAAiB;oBAAC;iBAAQ,GAAGR;YACjD;QACF,EAAE,OAAOU,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFH,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeO,KACpBC,eAA4B,EAC5BC,IAAiB;IAEjB,MAAM,EAAEC,OAAO,EAAEC,mBAAmB,EAAE,GAAGF;IACzC,MAAMG,eAAeH,KAAK,YAAY;IACtC,MAAM,EAAEI,OAAO,EAAE,GAAGD;IACpB,MAAM,EAAEE,QAAQ,EAAE,GAAGJ;IACrB,MAAMK,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,IAAID,KAAK,uBAAuB,IAAI,CAACG,aAAa,MAAM,CAAC,WAAW,EAClE,MAAM,IAAIN,MACRU,4CAA4CJ,aAAa,MAAM,CAAC,IAAI;IAIxE,MAAMK,sBACJL,aAAa,MAAM,CAAC,WAAW,IAAIC,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,GAClDA,QAAQ,MAAM,CAAC,aAAa,GAC5BlB;IAGN,MAAMuB,kBAAkBT,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMU,eAAe,MAAMC,2BAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,kBAAkBQ,qBAAqB;QACvC,yBAAyBR,KAAK,uBAAuB;QACrD,gBAAgB;QAChBS;IACF;IAEA,MAAMG,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaP;QACb,OAAOD,SAAS,KAAK;QACrB,QAAQA,SAAS,MAAM;QACvB,QAAQD,QAAQ,eAAe;IACjC;IACA,MAAMU,eAAeF,cAAc,WAAW;IAE9C,MAAMG,sBAAsBC,mBAAmBjB;IAC/C,MAAMkB,gBAAgBjB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMkB,yBAAyBlB,KAAK,sBAAsB,IAAI,EAAE;IAChE,MAAMmB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGF,cAAc,kBAAkB,EAAEF,oBAAoB,mBAAmB,CAAC;gBACrF;aACD;QACH;WACGG;KACJ;IAED,IAAIE;IAKJ,MAAMC,wBAAwBZ,kBAC1BP,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMoB,2BAA2BD,wBAC7B,CAAC,IAAI,EAAEA,uBAAuB,GAC9BnB,oBAAoB,sBAAsB,GACxC,KACA,CAAC,IAAI,EAAE/B,uBAAuB;IAGpC,MAAMoD,eAAerB,oBAAoB,cAAc;IACvD,MAAMsB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIrB,oBAAoB,sBAAsB,EAAE;QAC9CkB,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGlB,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEsB,kBAAkBF,0BAA0B;gBACzN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAZ,oBAAoB,mCAAmC;IACzD,OACEkB,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,+BAA+B,EAAEI,kBAAkBF,0BAA0B;YACtF;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKR;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFZ,oBAAoB,MAAM,CAACkB;IAG3BlB,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMuB,aAAavB,oBAAoB,QAAQ,CAACF,KAAK,kBAAkB;IAEvE,MAAM0B,OAAqC;QACzC;YAAE,MAAM;YAAU,SAAShB;QAAa;WACrCS;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EACjBC,gBAAgB,EACjB,GAAG,MAAMC,OAAOL,MAAMvB,cAAc;QACnC,aAAaH,KAAK,WAAW;QAG7B,6BAA6BA,KAAK,uBAAuB;IAC3D;IAGA,IAAIgC;IACJ,IAAI;QACF,IAAI;YACFA,aAAa5D,yBAAyBuD,aAAavB,QAAQ,UAAU;QACvE,EAAE,OAAM;YACN,MAAM6B,QAAQ,MAAMF,OAAOL,MAAMvB,cAAc;gBAC7C,aAAaH,KAAK,WAAW;gBAE7B,6BAA6BA,KAAK,uBAAuB;YAC3D;YACA2B,cAAcM,MAAM,OAAO;YAC3BL,QAAQK,MAAM,KAAK;YACnBJ,oBAAoBI,MAAM,iBAAiB;YAC3CH,mBAAmBG,MAAM,gBAAgB;YACzCD,aAAa5D,yBAAyBuD,aAAavB,QAAQ,UAAU;QACvE;QAEA,IAAI4B,WAAW,MAAM,IAAIA,AAA+B9C,WAA/B8C,WAAW,eAAe,EAAgB;YACjE9D,QACE;YAEF8D,WAAW,eAAe,GAAG9C;YAC7B8C,WAAW,eAAe,GAAG9C;QAC/B;QAEA,MAAMgD,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B9C,WAA/B8C,WAAW,eAAe,EAAgB;YAC5ChE,MAAM;YACNmE,yBAAyB;YAEzB,IAAI1B,iBACFP,oBAAoB,uBAAuB;QAE/C;QAEA,MAAMkC,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAP;YACAG;YACAF;YACAC;YACA,UAAUQ,uBAAuBH,SAASlC,KAAK,WAAW;YAC1DmC;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACzC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM8C,sBAAsBvC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACP,SAAWA,OAAO,IAAI,KAAKC;YAG9B1B,MAAM,+BAA+BuE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENvE,MAAM,gBAAgBwE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAelD,OAAO,KAAK,CAACiD,MAAM;gBACxC,IAAIC,cAAc;oBAChB,IAAI,CAAC3C,KAAK,uBAAuB,EAAE;wBACjC,IAAI,AAAwB,YAAxB,OAAO2C,cAETlD,OAAO,KAAK,CAACiD,MAAM,GAAG;4BAAE,QAAQC,aAAa,MAAM;wBAAC;wBAEtD;oBACF;oBAEAL,OACE9B,qBACA;oBAEFf,OAAO,KAAK,CAACiD,MAAM,GAAG;wBACpB,GAAGC,YAAY;wBACf,kBAAkBnC,oBAAoB,6BAA6B,CACjEmC,cACA;4BACE,cAAc/B,cAAc,YAAY;4BACxC,aAAaA,cAAc,WAAW;wBACxC;oBAEJ;gBACF;YACF;QACF;QAGA,IAAIH,iBAAiB;YACnB,IAAIuB,WAAW,cAAc,EAAE,QAC7B9B,oBAAoB,aAAa,CAAC8B,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMY,SAASZ,WAAW,mBAAmB,CAChD9B,oBAAoB,mBAAmB,CAAC0C;YAI5C,IAAIZ,WAAW,GAAG,EAChB9B,oBAAoB,gBAAgB,CAAC8B,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChB9B,oBAAoB,mBAAmB,CAAC8B,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnB9B,oBAAoB,YAAY,CAAC8B,WAAW,MAAM;QAGpD9B,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMyB;gBACR;aACD;QACH;QAEA,OAAOS;IACT,EAAE,OAAOS,YAAY;QAEnB,MAAMC,eACJD,sBAAsBhD,QAAQgD,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCnB,aACAC,OACAE;IAEJ;AACF"}
@@ -53,7 +53,7 @@ async function autoGlmLocate(elementDescription, options, getSystemPrompt) {
53
53
  });
54
54
  msgs.push(...addOns);
55
55
  }
56
- const { content: rawResponseContent, usage } = await callAIWithStringResponse(msgs, modelRuntime, {
56
+ const { content: rawResponseContent, usage, rawChoiceMessage } = await callAIWithStringResponse(msgs, modelRuntime, {
57
57
  abortSignal: options.abortSignal
58
58
  });
59
59
  debugInspect('auto-glm rawResponse:', rawResponseContent);
@@ -103,6 +103,7 @@ async function autoGlmLocate(elementDescription, options, getSystemPrompt) {
103
103
  errors
104
104
  },
105
105
  rawResponse: rawResponseContent,
106
+ rawChoiceMessage,
106
107
  usage,
107
108
  reasoning_content: parsed.think
108
109
  };
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/models/auto-glm/locate.mjs","sources":["../../../../../src/ai-model/models/auto-glm/locate.ts"],"sourcesContent":["import type { Rect } from '@/types';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type { TUserPrompt } from '../../../common';\nimport {\n type InspectAIArgs,\n extraTextFromUserPrompt,\n promptsToChatParam,\n} from '../../inspect';\nimport { findElementPrompt } from '../../prompt/llm-locator';\nimport { callAIWithStringResponse } from '../../service-caller/index';\nimport { finalizePixelBbox } from '../../shared/model-locate-result/bbox';\nimport { mapLocateResultToPixelBboxByCoordinates } from '../../shared/model-locate-result/pixel-bbox-mapper';\nimport { pixelBboxToRect } from '../../workflows/inspect/locate-result-rect';\nimport { mapSearchAreaPixelBboxToOriginalPixelBbox } from '../../workflows/inspect/search-area-mapping';\nimport type {\n LocateOptions,\n LocateResult,\n} from '../../workflows/inspect/types';\nimport { parseAutoGLMLocateResponse } from './parser';\n\nconst debugInspect = getDebug('ai:inspect');\n\nexport async function autoGlmLocate(\n elementDescription: TUserPrompt,\n options: LocateOptions,\n getSystemPrompt: () => string,\n): Promise<LocateResult> {\n const { context, modelRuntime } = options;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(elementDescription, 'cannot find the target element description');\n const elementDescriptionText = extraTextFromUserPrompt(elementDescription);\n const userInstructionPrompt = findElementPrompt(elementDescriptionText);\n\n const locateImage = options.searchConfig?.image ?? {\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n };\n const imagePayload = locateImage.imageBase64;\n const imageWidth = locateImage.width;\n const imageHeight = locateImage.height;\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: getSystemPrompt() },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: `Tap: ${userInstructionPrompt}`,\n },\n ],\n },\n ];\n\n if (typeof elementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: elementDescription.images,\n convertHttpImage2Base64: elementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const { content: rawResponseContent, usage } = await callAIWithStringResponse(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n },\n );\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElement: LocateResultElement | undefined;\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n const ctx = { preparedSize: { width: imageWidth, height: imageHeight } };\n const targetPixelBbox = finalizePixelBbox(\n mapLocateResultToPixelBboxByCoordinates(\n { type: 'point', coordinates: [x, y] },\n ctx,\n { shape: 'point', order: 'xy', normalizedBy: 1000 },\n ),\n parsed.coordinates,\n ctx,\n );\n resRect = pixelBboxToRect(\n mapSearchAreaPixelBboxToOriginalPixelBbox(\n targetPixelBbox,\n options.searchConfig?.mapping,\n ),\n );\n\n debugInspect('auto-glm resRect:', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n elementDescriptionText as string,\n );\n\n if (element) {\n matchedElement = element;\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n element: matchedElement,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n}\n"],"names":["debugInspect","getDebug","autoGlmLocate","elementDescription","options","getSystemPrompt","context","modelRuntime","screenshotBase64","assert","elementDescriptionText","extraTextFromUserPrompt","userInstructionPrompt","findElementPrompt","locateImage","imagePayload","imageWidth","imageHeight","msgs","addOns","promptsToChatParam","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElement","errors","x","y","ctx","targetPixelBbox","finalizePixelBbox","mapLocateResultToPixelBboxByCoordinates","pixelBboxToRect","mapSearchAreaPixelBboxToOriginalPixelBbox","element","generateElementByRect"],"mappings":";;;;;;;;;;;AAuBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,cACpBC,kBAA+B,EAC/BC,OAAsB,EACtBC,eAA6B;IAE7B,MAAM,EAAEC,OAAO,EAAEC,YAAY,EAAE,GAAGH;IAClC,MAAMI,mBAAmBF,QAAQ,UAAU,CAAC,MAAM;IAElDG,OAAON,oBAAoB;IAC3B,MAAMO,yBAAyBC,wBAAwBR;IACvD,MAAMS,wBAAwBC,kBAAkBH;IAEhD,MAAMI,cAAcV,QAAQ,YAAY,EAAE,SAAS;QACjD,aAAaI;QACb,OAAOF,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;IACjC;IACA,MAAMS,eAAeD,YAAY,WAAW;IAC5C,MAAME,aAAaF,YAAY,KAAK;IACpC,MAAMG,cAAcH,YAAY,MAAM;IAEtC,MAAMI,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASb;QAAkB;QAC7C;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKU;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM,CAAC,KAAK,EAAEH,uBAAuB;gBACvC;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOT,oBAAiC;QAC1C,MAAMgB,SAAS,MAAMC,mBAAmB;YACtC,QAAQjB,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAe,KAAK,IAAI,IAAIC;IACf;IAEA,MAAM,EAAE,SAASE,kBAAkB,EAAEC,KAAK,EAAE,GAAG,MAAMC,yBACnDL,MACAX,cACA;QACE,aAAaH,QAAQ,WAAW;IAClC;IAGFJ,aAAa,yBAAyBqB;IAEtC,MAAMG,SAASC,2BAA2BJ;IAE1CrB,aAAa,sBAAsBwB,OAAO,KAAK;IAC/CxB,aAAa,yBAAyBwB,OAAO,WAAW;IAExD,IAAIE;IACJ,IAAIC;IACJ,IAAIC,SAAmB,EAAE;IAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;QACvCI,SAAS;YAACJ,OAAO,KAAK,IAAI;SAAoC;QAC9DxB,aAAa,yBAAyB4B,MAAM,CAAC,EAAE;IACjD,OAAO;QACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;QAEnCxB,aAAa,iCAAiC;YAAE6B;YAAGC;QAAE;QAErD,MAAMC,MAAM;YAAE,cAAc;gBAAE,OAAOf;gBAAY,QAAQC;YAAY;QAAE;QACvE,MAAMe,kBAAkBC,kBACtBC,wCACE;YAAE,MAAM;YAAS,aAAa;gBAACL;gBAAGC;aAAE;QAAC,GACrCC,KACA;YAAE,OAAO;YAAS,OAAO;YAAM,cAAc;QAAK,IAEpDP,OAAO,WAAW,EAClBO;QAEFL,UAAUS,gBACRC,0CACEJ,iBACA5B,QAAQ,YAAY,EAAE;QAI1BJ,aAAa,qBAAqB0B;QAElC,MAAMW,UAA+BC,sBACnCZ,SACAhB;QAGF,IAAI2B,SACFV,iBAAiBU;IAErB;IAEA,OAAO;QACL,MAAMX;QACN,aAAa;YACX,SAASC;YACTC;QACF;QACA,aAAaP;QACbC;QACA,mBAAmBE,OAAO,KAAK;IACjC;AACF"}
1
+ {"version":3,"file":"ai-model/models/auto-glm/locate.mjs","sources":["../../../../../src/ai-model/models/auto-glm/locate.ts"],"sourcesContent":["import type { Rect } from '@/types';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type { TUserPrompt } from '../../../common';\nimport {\n type InspectAIArgs,\n extraTextFromUserPrompt,\n promptsToChatParam,\n} from '../../inspect';\nimport { findElementPrompt } from '../../prompt/llm-locator';\nimport { callAIWithStringResponse } from '../../service-caller/index';\nimport { finalizePixelBbox } from '../../shared/model-locate-result/bbox';\nimport { mapLocateResultToPixelBboxByCoordinates } from '../../shared/model-locate-result/pixel-bbox-mapper';\nimport { pixelBboxToRect } from '../../workflows/inspect/locate-result-rect';\nimport { mapSearchAreaPixelBboxToOriginalPixelBbox } from '../../workflows/inspect/search-area-mapping';\nimport type {\n LocateOptions,\n LocateResult,\n} from '../../workflows/inspect/types';\nimport { parseAutoGLMLocateResponse } from './parser';\n\nconst debugInspect = getDebug('ai:inspect');\n\nexport async function autoGlmLocate(\n elementDescription: TUserPrompt,\n options: LocateOptions,\n getSystemPrompt: () => string,\n): Promise<LocateResult> {\n const { context, modelRuntime } = options;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(elementDescription, 'cannot find the target element description');\n const elementDescriptionText = extraTextFromUserPrompt(elementDescription);\n const userInstructionPrompt = findElementPrompt(elementDescriptionText);\n\n const locateImage = options.searchConfig?.image ?? {\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n };\n const imagePayload = locateImage.imageBase64;\n const imageWidth = locateImage.width;\n const imageHeight = locateImage.height;\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: getSystemPrompt() },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: `Tap: ${userInstructionPrompt}`,\n },\n ],\n },\n ];\n\n if (typeof elementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: elementDescription.images,\n convertHttpImage2Base64: elementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponseContent,\n usage,\n rawChoiceMessage,\n } = await callAIWithStringResponse(msgs, modelRuntime, {\n abortSignal: options.abortSignal,\n });\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElement: LocateResultElement | undefined;\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n const ctx = { preparedSize: { width: imageWidth, height: imageHeight } };\n const targetPixelBbox = finalizePixelBbox(\n mapLocateResultToPixelBboxByCoordinates(\n { type: 'point', coordinates: [x, y] },\n ctx,\n { shape: 'point', order: 'xy', normalizedBy: 1000 },\n ),\n parsed.coordinates,\n ctx,\n );\n resRect = pixelBboxToRect(\n mapSearchAreaPixelBboxToOriginalPixelBbox(\n targetPixelBbox,\n options.searchConfig?.mapping,\n ),\n );\n\n debugInspect('auto-glm resRect:', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n elementDescriptionText as string,\n );\n\n if (element) {\n matchedElement = element;\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n element: matchedElement,\n errors,\n },\n rawResponse: rawResponseContent,\n rawChoiceMessage,\n usage,\n reasoning_content: parsed.think,\n };\n}\n"],"names":["debugInspect","getDebug","autoGlmLocate","elementDescription","options","getSystemPrompt","context","modelRuntime","screenshotBase64","assert","elementDescriptionText","extraTextFromUserPrompt","userInstructionPrompt","findElementPrompt","locateImage","imagePayload","imageWidth","imageHeight","msgs","addOns","promptsToChatParam","rawResponseContent","usage","rawChoiceMessage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElement","errors","x","y","ctx","targetPixelBbox","finalizePixelBbox","mapLocateResultToPixelBboxByCoordinates","pixelBboxToRect","mapSearchAreaPixelBboxToOriginalPixelBbox","element","generateElementByRect"],"mappings":";;;;;;;;;;;AAuBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,cACpBC,kBAA+B,EAC/BC,OAAsB,EACtBC,eAA6B;IAE7B,MAAM,EAAEC,OAAO,EAAEC,YAAY,EAAE,GAAGH;IAClC,MAAMI,mBAAmBF,QAAQ,UAAU,CAAC,MAAM;IAElDG,OAAON,oBAAoB;IAC3B,MAAMO,yBAAyBC,wBAAwBR;IACvD,MAAMS,wBAAwBC,kBAAkBH;IAEhD,MAAMI,cAAcV,QAAQ,YAAY,EAAE,SAAS;QACjD,aAAaI;QACb,OAAOF,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;IACjC;IACA,MAAMS,eAAeD,YAAY,WAAW;IAC5C,MAAME,aAAaF,YAAY,KAAK;IACpC,MAAMG,cAAcH,YAAY,MAAM;IAEtC,MAAMI,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASb;QAAkB;QAC7C;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKU;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM,CAAC,KAAK,EAAEH,uBAAuB;gBACvC;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOT,oBAAiC;QAC1C,MAAMgB,SAAS,MAAMC,mBAAmB;YACtC,QAAQjB,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAe,KAAK,IAAI,IAAIC;IACf;IAEA,MAAM,EACJ,SAASE,kBAAkB,EAC3BC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,yBAAyBN,MAAMX,cAAc;QACrD,aAAaH,QAAQ,WAAW;IAClC;IAEAJ,aAAa,yBAAyBqB;IAEtC,MAAMI,SAASC,2BAA2BL;IAE1CrB,aAAa,sBAAsByB,OAAO,KAAK;IAC/CzB,aAAa,yBAAyByB,OAAO,WAAW;IAExD,IAAIE;IACJ,IAAIC;IACJ,IAAIC,SAAmB,EAAE;IAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;QACvCI,SAAS;YAACJ,OAAO,KAAK,IAAI;SAAoC;QAC9DzB,aAAa,yBAAyB6B,MAAM,CAAC,EAAE;IACjD,OAAO;QACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;QAEnCzB,aAAa,iCAAiC;YAAE8B;YAAGC;QAAE;QAErD,MAAMC,MAAM;YAAE,cAAc;gBAAE,OAAOhB;gBAAY,QAAQC;YAAY;QAAE;QACvE,MAAMgB,kBAAkBC,kBACtBC,wCACE;YAAE,MAAM;YAAS,aAAa;gBAACL;gBAAGC;aAAE;QAAC,GACrCC,KACA;YAAE,OAAO;YAAS,OAAO;YAAM,cAAc;QAAK,IAEpDP,OAAO,WAAW,EAClBO;QAEFL,UAAUS,gBACRC,0CACEJ,iBACA7B,QAAQ,YAAY,EAAE;QAI1BJ,aAAa,qBAAqB2B;QAElC,MAAMW,UAA+BC,sBACnCZ,SACAjB;QAGF,IAAI4B,SACFV,iBAAiBU;IAErB;IAEA,OAAO;QACL,MAAMX;QACN,aAAa;YACX,SAASC;YACTC;QACF;QACA,aAAaR;QACbE;QACAD;QACA,mBAAmBG,OAAO,KAAK;IACjC;AACF"}
@@ -39,7 +39,7 @@ async function autoGlmPlanning(userInstruction, options, getSystemPrompt) {
39
39
  ...referenceImageMessages,
40
40
  ...conversationHistory.snapshot(1)
41
41
  ];
42
- const { content: rawResponse, usage } = await callAIWithStringResponse(msgs, options.modelRuntime, {
42
+ const { content: rawResponse, usage, rawChoiceMessage } = await callAIWithStringResponse(msgs, options.modelRuntime, {
43
43
  abortSignal: options.abortSignal
44
44
  });
45
45
  debug('autoGLMPlanning rawResponse:', rawResponse);
@@ -55,7 +55,7 @@ async function autoGlmPlanning(userInstruction, options, getSystemPrompt) {
55
55
  debug('Transformed actions:', transformedActions);
56
56
  } catch (parseError) {
57
57
  const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
58
- throw new AIResponseParseError(`Parse error: ${errorMessage}`, JSON.stringify(rawResponse, void 0, 2), usage);
58
+ throw new AIResponseParseError(`Parse error: ${errorMessage}`, JSON.stringify(rawResponse, void 0, 2), usage, rawChoiceMessage);
59
59
  }
60
60
  conversationHistory.append({
61
61
  role: 'assistant',
@@ -67,7 +67,8 @@ async function autoGlmPlanning(userInstruction, options, getSystemPrompt) {
67
67
  log: rawResponse,
68
68
  usage,
69
69
  shouldContinuePlanning,
70
- rawResponse: JSON.stringify(rawResponse, void 0, 2)
70
+ rawResponse: JSON.stringify(rawResponse, void 0, 2),
71
+ rawChoiceMessage
71
72
  };
72
73
  }
73
74
  export { autoGlmPlanning };
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/models/auto-glm/planning.mjs","sources":["../../../../../src/ai-model/models/auto-glm/planning.ts"],"sourcesContent":["import { type TUserPrompt, userPromptToString } from '@/common';\nimport type { PlanningAIResponse } from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIResponseParseError,\n callAIWithStringResponse,\n} from '../../service-caller/index';\nimport type { PlanOptions } from '../../workflows/planning/types';\nimport { transformAutoGLMAction } from './actions';\nimport { parseAction, parseAutoGLMResponse } from './parser';\n\nconst debug = getDebug('auto-glm-planning');\n\nexport async function autoGlmPlanning(\n userInstruction: TUserPrompt,\n options: PlanOptions,\n getSystemPrompt: () => string,\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, actionContext } = options;\n\n const systemPrompt =\n getSystemPrompt() +\n (actionContext\n ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>`\n : '');\n\n const imagePayloadBase64 = context.screenshot.base64;\n const userInstructionText = userPromptToString(userInstruction);\n const referenceImageMessages = options.referenceImageMessages ?? [];\n\n const userInstructionMessage: ChatCompletionMessageParam = {\n role: 'user',\n content: [{ type: 'text', text: userInstructionText }],\n };\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'image_url', image_url: { url: imagePayloadBase64 } }],\n });\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n userInstructionMessage,\n ...referenceImageMessages,\n ...conversationHistory.snapshot(1),\n ];\n\n const { content: rawResponse, usage } = await callAIWithStringResponse(\n msgs,\n options.modelRuntime,\n {\n abortSignal: options.abortSignal,\n },\n );\n\n debug('autoGLMPlanning rawResponse:', rawResponse);\n\n let parsedResponse: ReturnType<typeof parseAutoGLMResponse>;\n let transformedActions: ReturnType<typeof transformAutoGLMAction>;\n\n try {\n parsedResponse = parseAutoGLMResponse(rawResponse);\n debug('thinking in response:', parsedResponse.think);\n debug('action in response:', parsedResponse.content);\n\n const parsedAction = parseAction(parsedResponse);\n debug('Parsed action object:', parsedAction);\n transformedActions = transformAutoGLMAction(\n parsedAction,\n context.shotSize,\n options.actionSpace,\n );\n debug('Transformed actions:', transformedActions);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `Parse error: ${errorMessage}`,\n JSON.stringify(rawResponse, undefined, 2),\n usage,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: `<think>${parsedResponse.think}</think><answer>${parsedResponse.content}</answer>`,\n });\n\n const shouldContinuePlanning = !parsedResponse.content.startsWith('finish(');\n\n return {\n actions: transformedActions,\n log: rawResponse,\n usage,\n shouldContinuePlanning,\n rawResponse: JSON.stringify(rawResponse, undefined, 2),\n };\n}\n"],"names":["debug","getDebug","autoGlmPlanning","userInstruction","options","getSystemPrompt","conversationHistory","context","actionContext","systemPrompt","imagePayloadBase64","userInstructionText","userPromptToString","referenceImageMessages","userInstructionMessage","msgs","rawResponse","usage","callAIWithStringResponse","parsedResponse","transformedActions","parseAutoGLMResponse","parsedAction","parseAction","transformAutoGLMAction","parseError","errorMessage","Error","String","AIResponseParseError","JSON","undefined","shouldContinuePlanning"],"mappings":";;;;;AAYA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,gBACpBC,eAA4B,EAC5BC,OAAoB,EACpBC,eAA6B;IAE7B,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,aAAa,EAAE,GAAGJ;IAExD,MAAMK,eACJJ,oBACCG,CAAAA,gBACG,CAAC,yBAAyB,EAAEA,cAAc,0BAA0B,CAAC,GACrE,EAAC;IAEP,MAAME,qBAAqBH,QAAQ,UAAU,CAAC,MAAM;IACpD,MAAMI,sBAAsBC,mBAAmBT;IAC/C,MAAMU,yBAAyBT,QAAQ,sBAAsB,IAAI,EAAE;IAEnE,MAAMU,yBAAqD;QACzD,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAQ,MAAMH;YAAoB;SAAE;IACxD;IACAL,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAa,WAAW;oBAAE,KAAKI;gBAAmB;YAAE;SAAE;IAC1E;IAEA,MAAMK,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASN;QAAa;QACxCK;WACGD;WACAP,oBAAoB,QAAQ,CAAC;KACjC;IAED,MAAM,EAAE,SAASU,WAAW,EAAEC,KAAK,EAAE,GAAG,MAAMC,yBAC5CH,MACAX,QAAQ,YAAY,EACpB;QACE,aAAaA,QAAQ,WAAW;IAClC;IAGFJ,MAAM,gCAAgCgB;IAEtC,IAAIG;IACJ,IAAIC;IAEJ,IAAI;QACFD,iBAAiBE,qBAAqBL;QACtChB,MAAM,yBAAyBmB,eAAe,KAAK;QACnDnB,MAAM,uBAAuBmB,eAAe,OAAO;QAEnD,MAAMG,eAAeC,YAAYJ;QACjCnB,MAAM,yBAAyBsB;QAC/BF,qBAAqBI,uBACnBF,cACAf,QAAQ,QAAQ,EAChBH,QAAQ,WAAW;QAErBJ,MAAM,wBAAwBoB;IAChC,EAAE,OAAOK,YAAY;QAEnB,MAAMC,eACJD,sBAAsBE,QAAQF,WAAW,OAAO,GAAGG,OAAOH;QAC5D,MAAM,IAAII,qBACR,CAAC,aAAa,EAAEH,cAAc,EAC9BI,KAAK,SAAS,CAACd,aAAae,QAAW,IACvCd;IAEJ;IAEAX,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS,CAAC,OAAO,EAAEa,eAAe,KAAK,CAAC,gBAAgB,EAAEA,eAAe,OAAO,CAAC,SAAS,CAAC;IAC7F;IAEA,MAAMa,yBAAyB,CAACb,eAAe,OAAO,CAAC,UAAU,CAAC;IAElE,OAAO;QACL,SAASC;QACT,KAAKJ;QACLC;QACAe;QACA,aAAaF,KAAK,SAAS,CAACd,aAAae,QAAW;IACtD;AACF"}
1
+ {"version":3,"file":"ai-model/models/auto-glm/planning.mjs","sources":["../../../../../src/ai-model/models/auto-glm/planning.ts"],"sourcesContent":["import { type TUserPrompt, userPromptToString } from '@/common';\nimport type { PlanningAIResponse } from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIResponseParseError,\n callAIWithStringResponse,\n} from '../../service-caller/index';\nimport type { PlanOptions } from '../../workflows/planning/types';\nimport { transformAutoGLMAction } from './actions';\nimport { parseAction, parseAutoGLMResponse } from './parser';\n\nconst debug = getDebug('auto-glm-planning');\n\nexport async function autoGlmPlanning(\n userInstruction: TUserPrompt,\n options: PlanOptions,\n getSystemPrompt: () => string,\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, actionContext } = options;\n\n const systemPrompt =\n getSystemPrompt() +\n (actionContext\n ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>`\n : '');\n\n const imagePayloadBase64 = context.screenshot.base64;\n const userInstructionText = userPromptToString(userInstruction);\n const referenceImageMessages = options.referenceImageMessages ?? [];\n\n const userInstructionMessage: ChatCompletionMessageParam = {\n role: 'user',\n content: [{ type: 'text', text: userInstructionText }],\n };\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'image_url', image_url: { url: imagePayloadBase64 } }],\n });\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n userInstructionMessage,\n ...referenceImageMessages,\n ...conversationHistory.snapshot(1),\n ];\n\n const {\n content: rawResponse,\n usage,\n rawChoiceMessage,\n } = await callAIWithStringResponse(msgs, options.modelRuntime, {\n abortSignal: options.abortSignal,\n });\n\n debug('autoGLMPlanning rawResponse:', rawResponse);\n\n let parsedResponse: ReturnType<typeof parseAutoGLMResponse>;\n let transformedActions: ReturnType<typeof transformAutoGLMAction>;\n\n try {\n parsedResponse = parseAutoGLMResponse(rawResponse);\n debug('thinking in response:', parsedResponse.think);\n debug('action in response:', parsedResponse.content);\n\n const parsedAction = parseAction(parsedResponse);\n debug('Parsed action object:', parsedAction);\n transformedActions = transformAutoGLMAction(\n parsedAction,\n context.shotSize,\n options.actionSpace,\n );\n debug('Transformed actions:', transformedActions);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `Parse error: ${errorMessage}`,\n JSON.stringify(rawResponse, undefined, 2),\n usage,\n rawChoiceMessage,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: `<think>${parsedResponse.think}</think><answer>${parsedResponse.content}</answer>`,\n });\n\n const shouldContinuePlanning = !parsedResponse.content.startsWith('finish(');\n\n return {\n actions: transformedActions,\n log: rawResponse,\n usage,\n shouldContinuePlanning,\n rawResponse: JSON.stringify(rawResponse, undefined, 2),\n rawChoiceMessage,\n };\n}\n"],"names":["debug","getDebug","autoGlmPlanning","userInstruction","options","getSystemPrompt","conversationHistory","context","actionContext","systemPrompt","imagePayloadBase64","userInstructionText","userPromptToString","referenceImageMessages","userInstructionMessage","msgs","rawResponse","usage","rawChoiceMessage","callAIWithStringResponse","parsedResponse","transformedActions","parseAutoGLMResponse","parsedAction","parseAction","transformAutoGLMAction","parseError","errorMessage","Error","String","AIResponseParseError","JSON","undefined","shouldContinuePlanning"],"mappings":";;;;;AAYA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,gBACpBC,eAA4B,EAC5BC,OAAoB,EACpBC,eAA6B;IAE7B,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,aAAa,EAAE,GAAGJ;IAExD,MAAMK,eACJJ,oBACCG,CAAAA,gBACG,CAAC,yBAAyB,EAAEA,cAAc,0BAA0B,CAAC,GACrE,EAAC;IAEP,MAAME,qBAAqBH,QAAQ,UAAU,CAAC,MAAM;IACpD,MAAMI,sBAAsBC,mBAAmBT;IAC/C,MAAMU,yBAAyBT,QAAQ,sBAAsB,IAAI,EAAE;IAEnE,MAAMU,yBAAqD;QACzD,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAQ,MAAMH;YAAoB;SAAE;IACxD;IACAL,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAa,WAAW;oBAAE,KAAKI;gBAAmB;YAAE;SAAE;IAC1E;IAEA,MAAMK,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASN;QAAa;QACxCK;WACGD;WACAP,oBAAoB,QAAQ,CAAC;KACjC;IAED,MAAM,EACJ,SAASU,WAAW,EACpBC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,yBAAyBJ,MAAMX,QAAQ,YAAY,EAAE;QAC7D,aAAaA,QAAQ,WAAW;IAClC;IAEAJ,MAAM,gCAAgCgB;IAEtC,IAAII;IACJ,IAAIC;IAEJ,IAAI;QACFD,iBAAiBE,qBAAqBN;QACtChB,MAAM,yBAAyBoB,eAAe,KAAK;QACnDpB,MAAM,uBAAuBoB,eAAe,OAAO;QAEnD,MAAMG,eAAeC,YAAYJ;QACjCpB,MAAM,yBAAyBuB;QAC/BF,qBAAqBI,uBACnBF,cACAhB,QAAQ,QAAQ,EAChBH,QAAQ,WAAW;QAErBJ,MAAM,wBAAwBqB;IAChC,EAAE,OAAOK,YAAY;QAEnB,MAAMC,eACJD,sBAAsBE,QAAQF,WAAW,OAAO,GAAGG,OAAOH;QAC5D,MAAM,IAAII,qBACR,CAAC,aAAa,EAAEH,cAAc,EAC9BI,KAAK,SAAS,CAACf,aAAagB,QAAW,IACvCf,OACAC;IAEJ;IAEAZ,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS,CAAC,OAAO,EAAEc,eAAe,KAAK,CAAC,gBAAgB,EAAEA,eAAe,OAAO,CAAC,SAAS,CAAC;IAC7F;IAEA,MAAMa,yBAAyB,CAACb,eAAe,OAAO,CAAC,UAAU,CAAC;IAElE,OAAO;QACL,SAASC;QACT,KAAKL;QACLC;QACAgB;QACA,aAAaF,KAAK,SAAS,CAACf,aAAagB,QAAW;QACpDd;IACF;AACF"}
@@ -1,16 +1,22 @@
1
1
  const originalImageDetailForDefaultIntent = (input)=>'default' === input.intent || input.requiresOriginalImageDetail ? 'original' : void 0;
2
- const buildGpt5ChatCompletionParams = ()=>({
2
+ const buildGpt5ChatCompletionParams = (input)=>{
3
+ const { midsceneDefaults, userConfig } = input;
4
+ const { reasoningEnabled, reasoningEffort } = userConfig;
5
+ const commonOverrideConfig = {};
6
+ if (void 0 !== userConfig.temperature) commonOverrideConfig.temperature = userConfig.temperature;
7
+ const effectiveReasoningEffort = true === reasoningEnabled ? reasoningEffort ?? 'medium' : 'none';
8
+ return {
3
9
  config: {
4
- temperature: void 0
10
+ ...midsceneDefaults,
11
+ ...commonOverrideConfig,
12
+ reasoning_effort: effectiveReasoningEffort
5
13
  }
6
- });
14
+ };
15
+ };
7
16
  const gptAdapters = {
8
17
  'gpt-5': {
9
18
  chatCompletion: {
10
19
  unsupportedUserConfig: [
11
- 'temperature',
12
- 'reasoningEnabled',
13
- 'reasoningEffort',
14
20
  'reasoningBudget'
15
21
  ],
16
22
  buildChatCompletionParams: buildGpt5ChatCompletionParams,
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/models/gpt.mjs","sources":["../../../../src/ai-model/models/gpt.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport type {\n ChatCompletionCallContext,\n ChatCompletionParamsResult,\n ImageDetail,\n ModelAdapterDefinition,\n} from './types';\n\nconst originalImageDetailForDefaultIntent = (\n input: ChatCompletionCallContext,\n): ImageDetail | undefined =>\n input.intent === 'default' || input.requiresOriginalImageDetail\n ? 'original'\n : undefined;\n\nconst buildGpt5ChatCompletionParams = (): ChatCompletionParamsResult => {\n return {\n config: {\n // GPT-5 Chat Completions does not support temperature control.\n temperature: undefined,\n },\n };\n};\n\nexport const gptAdapters = {\n 'gpt-5': {\n chatCompletion: {\n unsupportedUserConfig: [\n 'temperature',\n 'reasoningEnabled',\n 'reasoningEffort',\n 'reasoningBudget',\n ],\n buildChatCompletionParams: buildGpt5ChatCompletionParams,\n resolveImageDetail: originalImageDetailForDefaultIntent,\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'bbox', order: 'xy' },\n },\n },\n },\n} satisfies Pick<Record<TModelFamily, ModelAdapterDefinition>, 'gpt-5'>;\n"],"names":["originalImageDetailForDefaultIntent","input","undefined","buildGpt5ChatCompletionParams","gptAdapters"],"mappings":"AAQA,MAAMA,sCAAsC,CAC1CC,QAEAA,AAAiB,cAAjBA,MAAM,MAAM,IAAkBA,MAAM,2BAA2B,GAC3D,aACAC;AAEN,MAAMC,gCAAgC,IAC7B;QACL,QAAQ;YAEN,aAAaD;QACf;IACF;AAGK,MAAME,cAAc;IACzB,SAAS;QACP,gBAAgB;YACd,uBAAuB;gBACrB;gBACA;gBACA;gBACA;aACD;YACD,2BAA2BD;YAC3B,oBAAoBH;QACtB;QACA,QAAQ;YACN,eAAe;gBACb,aAAa;oBAAE,OAAO;oBAAQ,OAAO;gBAAK;YAC5C;QACF;IACF;AACF"}
1
+ {"version":3,"file":"ai-model/models/gpt.mjs","sources":["../../../../src/ai-model/models/gpt.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport type {\n ChatCompletionCallContext,\n ChatCompletionParamsResult,\n ImageDetail,\n ModelAdapterDefinition,\n} from './types';\n\nconst originalImageDetailForDefaultIntent = (\n input: ChatCompletionCallContext,\n): ImageDetail | undefined =>\n input.intent === 'default' || input.requiresOriginalImageDetail\n ? 'original'\n : undefined;\n\nconst buildGpt5ChatCompletionParams = (\n input: ChatCompletionCallContext,\n): ChatCompletionParamsResult => {\n const { midsceneDefaults, userConfig } = input;\n const { reasoningEnabled, reasoningEffort } = userConfig;\n const commonOverrideConfig: Record<string, unknown> = {};\n\n if (userConfig.temperature !== undefined) {\n commonOverrideConfig.temperature = userConfig.temperature;\n }\n\n const effectiveReasoningEffort =\n reasoningEnabled === true ? (reasoningEffort ?? 'medium') : 'none';\n\n return {\n config: {\n ...midsceneDefaults,\n ...commonOverrideConfig,\n reasoning_effort: effectiveReasoningEffort,\n },\n };\n};\n\nexport const gptAdapters = {\n 'gpt-5': {\n chatCompletion: {\n unsupportedUserConfig: ['reasoningBudget'],\n buildChatCompletionParams: buildGpt5ChatCompletionParams,\n resolveImageDetail: originalImageDetailForDefaultIntent,\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'bbox', order: 'xy' },\n },\n },\n },\n} satisfies Pick<Record<TModelFamily, ModelAdapterDefinition>, 'gpt-5'>;\n"],"names":["originalImageDetailForDefaultIntent","input","undefined","buildGpt5ChatCompletionParams","midsceneDefaults","userConfig","reasoningEnabled","reasoningEffort","commonOverrideConfig","effectiveReasoningEffort","gptAdapters"],"mappings":"AAQA,MAAMA,sCAAsC,CAC1CC,QAEAA,AAAiB,cAAjBA,MAAM,MAAM,IAAkBA,MAAM,2BAA2B,GAC3D,aACAC;AAEN,MAAMC,gCAAgC,CACpCF;IAEA,MAAM,EAAEG,gBAAgB,EAAEC,UAAU,EAAE,GAAGJ;IACzC,MAAM,EAAEK,gBAAgB,EAAEC,eAAe,EAAE,GAAGF;IAC9C,MAAMG,uBAAgD,CAAC;IAEvD,IAAIH,AAA2BH,WAA3BG,WAAW,WAAW,EACxBG,qBAAqB,WAAW,GAAGH,WAAW,WAAW;IAG3D,MAAMI,2BACJH,AAAqB,SAArBA,mBAA6BC,mBAAmB,WAAY;IAE9D,OAAO;QACL,QAAQ;YACN,GAAGH,gBAAgB;YACnB,GAAGI,oBAAoB;YACvB,kBAAkBC;QACpB;IACF;AACF;AAEO,MAAMC,cAAc;IACzB,SAAS;QACP,gBAAgB;YACd,uBAAuB;gBAAC;aAAkB;YAC1C,2BAA2BP;YAC3B,oBAAoBH;QACtB;QACA,QAAQ;YACN,eAAe;gBACb,aAAa;oBAAE,OAAO;oBAAQ,OAAO;gBAAK;YAC5C;QACF;IACF;AACF"}
@@ -0,0 +1,42 @@
1
+ const buildKimiChatCompletionParams = (input)=>{
2
+ const { midsceneDefaults, userConfig } = input;
3
+ const { reasoningEnabled } = userConfig;
4
+ const effectiveReasoningEnabled = reasoningEnabled ?? false;
5
+ const commonOverrideConfig = {};
6
+ commonOverrideConfig.temperature = void 0;
7
+ const modelSpecificConfig = {
8
+ thinking: {
9
+ type: effectiveReasoningEnabled ? 'enabled' : 'disabled'
10
+ }
11
+ };
12
+ return {
13
+ config: {
14
+ ...midsceneDefaults,
15
+ ...commonOverrideConfig,
16
+ ...modelSpecificConfig
17
+ }
18
+ };
19
+ };
20
+ const kimiAdapters = {
21
+ kimi: {
22
+ chatCompletion: {
23
+ unsupportedUserConfig: [
24
+ 'reasoningEffort',
25
+ 'reasoningBudget'
26
+ ],
27
+ buildChatCompletionParams: buildKimiChatCompletionParams
28
+ },
29
+ locate: {
30
+ resultAdapter: {
31
+ coordinates: {
32
+ shape: 'point',
33
+ order: 'xy',
34
+ normalizedBy: 1
35
+ }
36
+ }
37
+ }
38
+ }
39
+ };
40
+ export { kimiAdapters };
41
+
42
+ //# sourceMappingURL=kimi.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/models/kimi.mjs","sources":["../../../../src/ai-model/models/kimi.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport type {\n ChatCompletionCallContext,\n ChatCompletionParamsResult,\n ModelAdapterDefinition,\n} from './types';\n\nconst buildKimiChatCompletionParams = (\n input: ChatCompletionCallContext,\n): ChatCompletionParamsResult => {\n const { midsceneDefaults, userConfig } = input;\n const { reasoningEnabled } = userConfig;\n const effectiveReasoningEnabled = reasoningEnabled ?? false;\n const commonOverrideConfig: Record<string, unknown> = {};\n\n // kimi disallow custom temperature\n commonOverrideConfig.temperature = undefined;\n\n const modelSpecificConfig: Record<string, unknown> = {\n thinking: {\n type: effectiveReasoningEnabled ? 'enabled' : 'disabled',\n },\n };\n\n return {\n config: {\n ...midsceneDefaults,\n ...commonOverrideConfig,\n ...modelSpecificConfig,\n },\n };\n};\n\nexport const kimiAdapters = {\n kimi: {\n chatCompletion: {\n unsupportedUserConfig: ['reasoningEffort', 'reasoningBudget'],\n buildChatCompletionParams: buildKimiChatCompletionParams,\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'point', order: 'xy', normalizedBy: 1 },\n },\n },\n },\n} satisfies Pick<Record<TModelFamily, ModelAdapterDefinition>, 'kimi'>;\n"],"names":["buildKimiChatCompletionParams","input","midsceneDefaults","userConfig","reasoningEnabled","effectiveReasoningEnabled","commonOverrideConfig","undefined","modelSpecificConfig","kimiAdapters"],"mappings":"AAOA,MAAMA,gCAAgC,CACpCC;IAEA,MAAM,EAAEC,gBAAgB,EAAEC,UAAU,EAAE,GAAGF;IACzC,MAAM,EAAEG,gBAAgB,EAAE,GAAGD;IAC7B,MAAME,4BAA4BD,oBAAoB;IACtD,MAAME,uBAAgD,CAAC;IAGvDA,qBAAqB,WAAW,GAAGC;IAEnC,MAAMC,sBAA+C;QACnD,UAAU;YACR,MAAMH,4BAA4B,YAAY;QAChD;IACF;IAEA,OAAO;QACL,QAAQ;YACN,GAAGH,gBAAgB;YACnB,GAAGI,oBAAoB;YACvB,GAAGE,mBAAmB;QACxB;IACF;AACF;AAEO,MAAMC,eAAe;IAC1B,MAAM;QACJ,gBAAgB;YACd,uBAAuB;gBAAC;gBAAmB;aAAkB;YAC7D,2BAA2BT;QAC7B;QACA,QAAQ;YACN,eAAe;gBACb,aAAa;oBAAE,OAAO;oBAAS,OAAO;oBAAM,cAAc;gBAAE;YAC9D;QACF;IACF;AACF"}
@@ -5,6 +5,7 @@ import { doubaoAdapters } from "./doubao.mjs";
5
5
  import { geminiAdapters } from "./gemini.mjs";
6
6
  import { glmAdapters } from "./glm.mjs";
7
7
  import { gptAdapters } from "./gpt.mjs";
8
+ import { kimiAdapters } from "./kimi.mjs";
8
9
  import { qwenAdapters } from "./qwen.mjs";
9
10
  import { ResolvedModelAdapter } from "./resolved.mjs";
10
11
  import { uiTarsAdapters } from "./ui-tars/adapter.mjs";
@@ -15,7 +16,8 @@ const MODEL_ADAPTER_CONFIGS = {
15
16
  ...uiTarsAdapters,
16
17
  ...glmAdapters,
17
18
  ...autoGlmAdapters,
18
- ...gptAdapters
19
+ ...gptAdapters,
20
+ ...kimiAdapters
19
21
  };
20
22
  const modelAdapterCache = new Map();
21
23
  const debugModelAdapter = getDebug('ai:model-adapter');
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/models/registry.mjs","sources":["../../../../src/ai-model/models/registry.ts"],"sourcesContent":["import type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { autoGlmAdapters } from './auto-glm/adapter';\nimport { defaultOpenAICompatibleAdapterConfig } from './default';\nimport { doubaoAdapters } from './doubao';\nimport { geminiAdapters } from './gemini';\nimport { glmAdapters } from './glm';\nimport { gptAdapters } from './gpt';\nimport { qwenAdapters } from './qwen';\nimport { ResolvedModelAdapter } from './resolved';\nimport type {\n ModelAdapter,\n ModelAdapterDefinition,\n ModelRuntime,\n} from './types';\nimport { uiTarsAdapters } from './ui-tars/adapter';\n\nexport const MODEL_ADAPTER_CONFIGS = {\n ...qwenAdapters,\n ...doubaoAdapters,\n ...geminiAdapters,\n ...uiTarsAdapters,\n ...glmAdapters,\n ...autoGlmAdapters,\n ...gptAdapters,\n} satisfies Record<TModelFamily, ModelAdapterDefinition>;\n\ntype ModelAdapterCacheKey = TModelFamily | 'default';\n\nconst modelAdapterCache = new Map<ModelAdapterCacheKey, ModelAdapter>();\nconst debugModelAdapter = getDebug('ai:model-adapter');\n\nfunction debugAdapterUnsupportedUserConfig(\n modelFamily: ModelAdapterCacheKey,\n adapter: ModelAdapter,\n): void {\n if (adapter.chatCompletion.unsupportedUserConfig.length === 0) {\n return;\n }\n\n debugModelAdapter(\n `model adapter \"${modelFamily}\" unsupportedUserConfig: ${JSON.stringify(\n adapter.chatCompletion.unsupportedUserConfig,\n )}`,\n );\n}\n\nexport function getModelAdapter(modelFamily?: TModelFamily): ModelAdapter {\n const cacheKey: ModelAdapterCacheKey = modelFamily ?? 'default';\n let adapter = modelAdapterCache.get(cacheKey);\n if (adapter) {\n return adapter;\n }\n\n const config = modelFamily\n ? MODEL_ADAPTER_CONFIGS[modelFamily]\n : defaultOpenAICompatibleAdapterConfig;\n if (!config) {\n throw new Error(\n `No model adapter registered for modelFamily: ${modelFamily}`,\n );\n }\n\n adapter = new ResolvedModelAdapter(config, cacheKey);\n modelAdapterCache.set(cacheKey, adapter);\n debugAdapterUnsupportedUserConfig(cacheKey, adapter);\n\n return adapter;\n}\n\nexport function getModelRuntime(config: IModelConfig): ModelRuntime {\n return {\n config,\n adapter: getModelAdapter(config.modelFamily),\n };\n}\n"],"names":["MODEL_ADAPTER_CONFIGS","qwenAdapters","doubaoAdapters","geminiAdapters","uiTarsAdapters","glmAdapters","autoGlmAdapters","gptAdapters","modelAdapterCache","Map","debugModelAdapter","getDebug","debugAdapterUnsupportedUserConfig","modelFamily","adapter","JSON","getModelAdapter","cacheKey","config","defaultOpenAICompatibleAdapterConfig","Error","ResolvedModelAdapter","getModelRuntime"],"mappings":";;;;;;;;;;AAiBO,MAAMA,wBAAwB;IACnC,GAAGC,YAAY;IACf,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,WAAW;IACd,GAAGC,eAAe;IAClB,GAAGC,WAAW;AAChB;AAIA,MAAMC,oBAAoB,IAAIC;AAC9B,MAAMC,oBAAoBC,SAAS;AAEnC,SAASC,kCACPC,WAAiC,EACjCC,OAAqB;IAErB,IAAIA,AAAwD,MAAxDA,QAAQ,cAAc,CAAC,qBAAqB,CAAC,MAAM,EACrD;IAGFJ,kBACE,CAAC,eAAe,EAAEG,YAAY,yBAAyB,EAAEE,KAAK,SAAS,CACrED,QAAQ,cAAc,CAAC,qBAAqB,GAC3C;AAEP;AAEO,SAASE,gBAAgBH,WAA0B;IACxD,MAAMI,WAAiCJ,eAAe;IACtD,IAAIC,UAAUN,kBAAkB,GAAG,CAACS;IACpC,IAAIH,SACF,OAAOA;IAGT,MAAMI,SAASL,cACXb,qBAAqB,CAACa,YAAY,GAClCM;IACJ,IAAI,CAACD,QACH,MAAM,IAAIE,MACR,CAAC,6CAA6C,EAAEP,aAAa;IAIjEC,UAAU,IAAIO,qBAAqBH,QAAQD;IAC3CT,kBAAkB,GAAG,CAACS,UAAUH;IAChCF,kCAAkCK,UAAUH;IAE5C,OAAOA;AACT;AAEO,SAASQ,gBAAgBJ,MAAoB;IAClD,OAAO;QACLA;QACA,SAASF,gBAAgBE,OAAO,WAAW;IAC7C;AACF"}
1
+ {"version":3,"file":"ai-model/models/registry.mjs","sources":["../../../../src/ai-model/models/registry.ts"],"sourcesContent":["import type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { autoGlmAdapters } from './auto-glm/adapter';\nimport { defaultOpenAICompatibleAdapterConfig } from './default';\nimport { doubaoAdapters } from './doubao';\nimport { geminiAdapters } from './gemini';\nimport { glmAdapters } from './glm';\nimport { gptAdapters } from './gpt';\nimport { kimiAdapters } from './kimi';\nimport { qwenAdapters } from './qwen';\nimport { ResolvedModelAdapter } from './resolved';\nimport type {\n ModelAdapter,\n ModelAdapterDefinition,\n ModelRuntime,\n} from './types';\nimport { uiTarsAdapters } from './ui-tars/adapter';\n\nexport const MODEL_ADAPTER_CONFIGS = {\n ...qwenAdapters,\n ...doubaoAdapters,\n ...geminiAdapters,\n ...uiTarsAdapters,\n ...glmAdapters,\n ...autoGlmAdapters,\n ...gptAdapters,\n ...kimiAdapters,\n} satisfies Record<TModelFamily, ModelAdapterDefinition>;\n\ntype ModelAdapterCacheKey = TModelFamily | 'default';\n\nconst modelAdapterCache = new Map<ModelAdapterCacheKey, ModelAdapter>();\nconst debugModelAdapter = getDebug('ai:model-adapter');\n\nfunction debugAdapterUnsupportedUserConfig(\n modelFamily: ModelAdapterCacheKey,\n adapter: ModelAdapter,\n): void {\n if (adapter.chatCompletion.unsupportedUserConfig.length === 0) {\n return;\n }\n\n debugModelAdapter(\n `model adapter \"${modelFamily}\" unsupportedUserConfig: ${JSON.stringify(\n adapter.chatCompletion.unsupportedUserConfig,\n )}`,\n );\n}\n\nexport function getModelAdapter(modelFamily?: TModelFamily): ModelAdapter {\n const cacheKey: ModelAdapterCacheKey = modelFamily ?? 'default';\n let adapter = modelAdapterCache.get(cacheKey);\n if (adapter) {\n return adapter;\n }\n\n const config = modelFamily\n ? MODEL_ADAPTER_CONFIGS[modelFamily]\n : defaultOpenAICompatibleAdapterConfig;\n if (!config) {\n throw new Error(\n `No model adapter registered for modelFamily: ${modelFamily}`,\n );\n }\n\n adapter = new ResolvedModelAdapter(config, cacheKey);\n modelAdapterCache.set(cacheKey, adapter);\n debugAdapterUnsupportedUserConfig(cacheKey, adapter);\n\n return adapter;\n}\n\nexport function getModelRuntime(config: IModelConfig): ModelRuntime {\n return {\n config,\n adapter: getModelAdapter(config.modelFamily),\n };\n}\n"],"names":["MODEL_ADAPTER_CONFIGS","qwenAdapters","doubaoAdapters","geminiAdapters","uiTarsAdapters","glmAdapters","autoGlmAdapters","gptAdapters","kimiAdapters","modelAdapterCache","Map","debugModelAdapter","getDebug","debugAdapterUnsupportedUserConfig","modelFamily","adapter","JSON","getModelAdapter","cacheKey","config","defaultOpenAICompatibleAdapterConfig","Error","ResolvedModelAdapter","getModelRuntime"],"mappings":";;;;;;;;;;;AAkBO,MAAMA,wBAAwB;IACnC,GAAGC,YAAY;IACf,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,WAAW;IACd,GAAGC,eAAe;IAClB,GAAGC,WAAW;IACd,GAAGC,YAAY;AACjB;AAIA,MAAMC,oBAAoB,IAAIC;AAC9B,MAAMC,oBAAoBC,SAAS;AAEnC,SAASC,kCACPC,WAAiC,EACjCC,OAAqB;IAErB,IAAIA,AAAwD,MAAxDA,QAAQ,cAAc,CAAC,qBAAqB,CAAC,MAAM,EACrD;IAGFJ,kBACE,CAAC,eAAe,EAAEG,YAAY,yBAAyB,EAAEE,KAAK,SAAS,CACrED,QAAQ,cAAc,CAAC,qBAAqB,GAC3C;AAEP;AAEO,SAASE,gBAAgBH,WAA0B;IACxD,MAAMI,WAAiCJ,eAAe;IACtD,IAAIC,UAAUN,kBAAkB,GAAG,CAACS;IACpC,IAAIH,SACF,OAAOA;IAGT,MAAMI,SAASL,cACXd,qBAAqB,CAACc,YAAY,GAClCM;IACJ,IAAI,CAACD,QACH,MAAM,IAAIE,MACR,CAAC,6CAA6C,EAAEP,aAAa;IAIjEC,UAAU,IAAIO,qBAAqBH,QAAQD;IAC3CT,kBAAkB,GAAG,CAACS,UAAUH;IAChCF,kCAAkCK,UAAUH;IAE5C,OAAOA;AACT;AAEO,SAASQ,gBAAgBJ,MAAoB;IAClD,OAAO;QACLA;QACA,SAASF,gBAAgBE,OAAO,WAAW;IAC7C;AACF"}
@@ -77,7 +77,7 @@ async function uiTarsPlanning(userInstruction, options, uiTarsModelVersion) {
77
77
  parsed = parseResult.parsed;
78
78
  } catch (parseError) {
79
79
  const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
80
- throw new AIResponseParseError(`Parse error: ${errorMessage}`, JSON.stringify(res.content, void 0, 2), res.usage);
80
+ throw new AIResponseParseError(`Parse error: ${errorMessage}`, JSON.stringify(res.content, void 0, 2), res.usage, res.rawChoiceMessage);
81
81
  }
82
82
  const { shotSize } = context;
83
83
  debug('ui-tars modelVer', uiTarsModelVersion, ', parsed', JSON.stringify(parsed));
@@ -192,7 +192,7 @@ async function uiTarsPlanning(userInstruction, options, uiTarsModelVersion) {
192
192
  'No actions found in UI-TARS response.',
193
193
  ...errorDetails
194
194
  ].join('\n');
195
- throw new AIResponseParseError(errorMessage, JSON.stringify(res.content, void 0, 2), res.usage);
195
+ throw new AIResponseParseError(errorMessage, JSON.stringify(res.content, void 0, 2), res.usage, res.rawChoiceMessage);
196
196
  }
197
197
  debug('transformActions', JSON.stringify(transformActions, null, 2));
198
198
  const log = getSummary(res.content);
@@ -205,6 +205,7 @@ async function uiTarsPlanning(userInstruction, options, uiTarsModelVersion) {
205
205
  log,
206
206
  usage: res.usage,
207
207
  rawResponse: JSON.stringify(res.content, void 0, 2),
208
+ rawChoiceMessage: res.rawChoiceMessage,
208
209
  shouldContinuePlanning: shouldContinue
209
210
  };
210
211
  }