@midscene/core 0.28.11-beta-20250922065131.0 → 0.28.12-beta-20250923080328.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/es/agent/agent.mjs +1 -1
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +37 -164
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/ui-utils.mjs +27 -13
  6. package/dist/es/agent/ui-utils.mjs.map +1 -1
  7. package/dist/es/agent/utils.mjs +1 -1
  8. package/dist/es/ai-model/conversation-history.mjs +58 -0
  9. package/dist/es/ai-model/conversation-history.mjs.map +1 -0
  10. package/dist/es/ai-model/index.mjs +2 -1
  11. package/dist/es/ai-model/llm-planning.mjs +23 -3
  12. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  13. package/dist/es/ai-model/ui-tars-planning.mjs +26 -6
  14. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  15. package/dist/es/utils.mjs +2 -2
  16. package/dist/es/yaml/player.mjs +0 -9
  17. package/dist/es/yaml/player.mjs.map +1 -1
  18. package/dist/lib/agent/agent.js +1 -1
  19. package/dist/lib/agent/agent.js.map +1 -1
  20. package/dist/lib/agent/tasks.js +36 -163
  21. package/dist/lib/agent/tasks.js.map +1 -1
  22. package/dist/lib/agent/ui-utils.js +27 -13
  23. package/dist/lib/agent/ui-utils.js.map +1 -1
  24. package/dist/lib/agent/utils.js +1 -1
  25. package/dist/lib/ai-model/conversation-history.js +92 -0
  26. package/dist/lib/ai-model/conversation-history.js.map +1 -0
  27. package/dist/lib/ai-model/index.js +6 -2
  28. package/dist/lib/ai-model/llm-planning.js +23 -3
  29. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  30. package/dist/lib/ai-model/ui-tars-planning.js +26 -6
  31. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  32. package/dist/lib/utils.js +2 -2
  33. package/dist/lib/yaml/player.js +0 -9
  34. package/dist/lib/yaml/player.js.map +1 -1
  35. package/dist/types/agent/tasks.d.ts +3 -18
  36. package/dist/types/agent/ui-utils.d.ts +1 -1
  37. package/dist/types/ai-model/conversation-history.d.ts +18 -0
  38. package/dist/types/ai-model/index.d.ts +1 -0
  39. package/dist/types/ai-model/llm-planning.d.ts +2 -0
  40. package/dist/types/ai-model/ui-tars-planning.d.ts +6 -18
  41. package/package.json +3 -3
@@ -15,18 +15,31 @@ const pointToBbox = (point, width, height)=>[
15
15
  Math.round(Math.min(point.x + bboxSize / 2, width)),
16
16
  Math.round(Math.min(point.y + bboxSize / 2, height))
17
17
  ];
18
- async function vlmPlanning(options) {
19
- const { conversationHistory, userInstruction, size, modelConfig } = options;
18
+ async function vlmPlanning(userInstruction, options) {
19
+ const { conversationHistory, context, modelConfig } = options;
20
20
  const { uiTarsModelVersion } = modelConfig;
21
21
  const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
22
+ const imagePayload = await resizeImageForUiTars(context.screenshotBase64, context.size, uiTarsModelVersion);
23
+ conversationHistory.append({
24
+ role: 'user',
25
+ content: [
26
+ {
27
+ type: 'image_url',
28
+ image_url: {
29
+ url: imagePayload
30
+ }
31
+ }
32
+ ]
33
+ });
22
34
  const res = await callAIWithStringResponse([
23
35
  {
24
36
  role: 'user',
25
37
  content: systemPrompt
26
38
  },
27
- ...conversationHistory
39
+ ...conversationHistory.snapshot()
28
40
  ], AIActionType.INSPECT_ELEMENT, modelConfig);
29
41
  const convertedText = convertBboxToCoordinates(res.content);
42
+ const { size } = context;
30
43
  const { parsed } = actionParser({
31
44
  prediction: convertedText,
32
45
  factor: [
@@ -41,6 +54,7 @@ async function vlmPlanning(options) {
41
54
  });
42
55
  debug('ui-tars modelVer', uiTarsModelVersion, ', parsed', JSON.stringify(parsed));
43
56
  const transformActions = [];
57
+ let shouldContinue = true;
44
58
  parsed.forEach((action)=>{
45
59
  if ('click' === action.action_type) {
46
60
  assert(action.action_inputs.start_box, 'start_box is required');
@@ -118,6 +132,7 @@ async function vlmPlanning(options) {
118
132
  },
119
133
  thought: action.thought || ''
120
134
  });
135
+ else if ('finished' === action.action_type) shouldContinue = false;
121
136
  });
122
137
  if (0 === transformActions.length) throw new Error(`No actions found, response: ${res.content}`, {
123
138
  cause: {
@@ -126,12 +141,17 @@ async function vlmPlanning(options) {
126
141
  }
127
142
  });
128
143
  debug('transformActions', JSON.stringify(transformActions, null, 2));
144
+ const log = getSummary(res.content);
145
+ conversationHistory.append({
146
+ role: 'assistant',
147
+ content: log
148
+ });
129
149
  return {
130
150
  actions: transformActions,
131
- actionsFromModel: parsed,
132
- action_summary: getSummary(res.content),
151
+ log,
133
152
  usage: res.usage,
134
- rawResponse: JSON.stringify(res.content, void 0, 2)
153
+ rawResponse: JSON.stringify(res.content, void 0, 2),
154
+ more_actions_needed_by_instruction: shouldContinue
135
155
  };
136
156
  }
137
157
  function convertBboxToCoordinates(text) {
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { callAIWithStringResponse } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelConfig: IModelConfig;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelConfig } = options;\n const { uiTarsModelVersion } = modelConfig;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await callAIWithStringResponse(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelConfig,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: uiTarsModelVersion,\n });\n\n debug(\n 'ui-tars modelVer',\n uiTarsModelVersion,\n ', parsed',\n JSON.stringify(parsed),\n );\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n uiTarsVersion: UITarsModelVersion | undefined,\n) {\n if (uiTarsVersion === UITarsModelVersion.V1_5) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelConfig","uiTarsModelVersion","systemPrompt","getUiTarsPlanningPrompt","res","callAIWithStringResponse","AIActionType","convertedText","convertBboxToCoordinates","parsed","actionParser","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","Error","getSummary","undefined","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","uiTarsVersion","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AAyBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,WAAW,EAAE,GAAGJ;IACpE,MAAM,EAAEK,kBAAkB,EAAE,GAAGD;IAC/B,MAAME,eAAeC,4BAA4BL;IAEjD,MAAMM,MAAM,MAAMC,yBAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGL;KACJ,EACDS,aAAa,eAAe,EAC5BN;IAEF,MAAMO,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAM,EAAEK,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYH;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOR,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUE;IACZ;IAEAd,MACE,oBACAc,oBACA,YACAU,KAAK,SAAS,CAACF;IAGjB,MAAMG,mBAAqC,EAAE;IAC7CH,OAAO,OAAO,CAAC,CAACI;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMtB,QAAQwB,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEd;YACvDa,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIc,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEd;YAC5D,MAAMkB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEd;YACxDa,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAG0B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCjB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQc,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAG2B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjClB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASc,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASM;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASC,OAAO,OAAO,IAAI;QAC7B;IAEJ;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIS,MAAM,CAAC,4BAA4B,EAAEjB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBK;QACF;IACF;IAGFtB,MAAM,oBAAoBwB,KAAK,SAAS,CAACC,kBAAkB,MAAM;IAEjE,OAAO;QACL,SAASA;QACT,kBAAkBH;QAClB,gBAAgBa,WAAWlB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaO,KAAK,SAAS,CAACP,IAAI,OAAO,EAAEmB,QAAW;IACtD;AACF;AAOA,SAASf,yBAAyBgB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI3C,KAAK,KAAK,CAAEsC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI5C,KAAK,KAAK,CAAEwC,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASX,SAASyB,QAAgB,EAAEzC,IAAuC;IACzE,MAAM,CAACsC,GAAGC,EAAE,GAAG3B,KAAK,KAAK,CAAC6B;IAC1B,OAAO;QAACH,IAAItC,KAAK,KAAK;QAAEuC,IAAIvC,KAAK,MAAM;KAAC;AAC1C;AAkEO,eAAe0C,qBACpBC,WAAmB,EACnB3C,IAAU,EACV4C,aAA6C;IAE7C,IAAIA,kBAAkBC,mBAAmB,IAAI,EAAE;QAC7CzD,MAAM,uCAAuCY;QAC7C,MAAM8C,gBAAgB9C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAM+C,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAerD,KAAK,IAAI,CAACoD,YAAYD;YAC3C,MAAMG,WAAWtD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGgD;YACzC,MAAME,YAAYvD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGgD;YAC3C5D,MACE,2DACA6D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
1
+ {"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n PlanningAIResponse,\n PlanningAction,\n Size,\n UIContext,\n} from '@/types';\nimport { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport { AIActionType } from './common';\nimport type { ConversationHistory } from './conversation-history';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { callAIWithStringResponse } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig } = options;\n const { uiTarsModelVersion } = modelConfig;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const imagePayload = await resizeImageForUiTars(\n context.screenshotBase64,\n context.size,\n uiTarsModelVersion,\n );\n\n conversationHistory.append({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n },\n },\n ],\n });\n\n const res = await callAIWithStringResponse(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory.snapshot(),\n ],\n AIActionType.INSPECT_ELEMENT,\n modelConfig,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const { size } = context;\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: uiTarsModelVersion,\n });\n\n debug(\n 'ui-tars modelVer',\n uiTarsModelVersion,\n ', parsed',\n JSON.stringify(parsed),\n );\n\n const transformActions: PlanningAction[] = [];\n let shouldContinue = true;\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n shouldContinue = false;\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n const log = getSummary(res.content);\n\n conversationHistory.append({\n role: 'assistant',\n content: log,\n });\n\n return {\n actions: transformActions,\n log,\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n more_actions_needed_by_instruction: shouldContinue,\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n uiTarsVersion: UITarsModelVersion | undefined,\n) {\n if (uiTarsVersion === UITarsModelVersion.V1_5) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","userInstruction","options","conversationHistory","context","modelConfig","uiTarsModelVersion","systemPrompt","getUiTarsPlanningPrompt","imagePayload","resizeImageForUiTars","res","callAIWithStringResponse","AIActionType","convertedText","convertBboxToCoordinates","size","parsed","actionParser","JSON","transformActions","shouldContinue","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","Error","log","getSummary","undefined","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","imageBase64","uiTarsVersion","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AAyBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YACpBC,eAAuB,EACvBC,OAIC;IAED,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,WAAW,EAAE,GAAGH;IACtD,MAAM,EAAEI,kBAAkB,EAAE,GAAGD;IAC/B,MAAME,eAAeC,4BAA4BP;IAEjD,MAAMQ,eAAe,MAAMC,qBACzBN,QAAQ,gBAAgB,EACxBA,QAAQ,IAAI,EACZE;IAGFH,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKM;gBACP;YACF;SACD;IACH;IAEA,MAAME,MAAM,MAAMC,yBAChB;QACE;YACE,MAAM;YACN,SAASL;QACX;WACGJ,oBAAoB,QAAQ;KAChC,EACDU,aAAa,eAAe,EAC5BR;IAEF,MAAMS,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAM,EAAEK,IAAI,EAAE,GAAGZ;IACjB,MAAM,EAAEa,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYJ;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOE,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUV;IACZ;IAEAd,MACE,oBACAc,oBACA,YACAa,KAAK,SAAS,CAACF;IAGjB,MAAMG,mBAAqC,EAAE;IAC7C,IAAIC,iBAAiB;IACrBJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAM1B,QAAQ4B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEN;YACvDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQE,OAAO,OAAO,IAAI;wBAC1B,MAAM3B,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BoB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIM,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEN;YAC5D,MAAMU,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEN;YACxDI,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQE,OAAO,OAAO,IAAI;wBAC1B,MAAM3B,YACJ;4BAAE,GAAG8B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCT,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQM,OAAO,OAAO,IAAI;wBAC1B,MAAM3B,YACJ;4BAAE,GAAG+B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCV,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASM,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BF,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOE,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BF,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWE,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BF,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASE,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DF,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASO;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BF,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASE,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB;IAErB;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIU,MAAM,CAAC,4BAA4B,EAAEnB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBM;QACF;IACF;IAGFzB,MAAM,oBAAoB2B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IACjE,MAAMW,MAAMC,WAAWrB,IAAI,OAAO;IAElCR,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS4B;IACX;IAEA,OAAO;QACL,SAASX;QACTW;QACA,OAAOpB,IAAI,KAAK;QAChB,aAAaQ,KAAK,SAAS,CAACR,IAAI,OAAO,EAAEsB,QAAW;QACpD,oCAAoCZ;IACtC;AACF;AAOA,SAASN,yBAAyBmB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAIhD,KAAK,KAAK,CAAE2C,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAIjD,KAAK,KAAK,CAAE6C,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASZ,SAAS0B,QAAgB,EAAElC,IAAuC;IACzE,MAAM,CAAC+B,GAAGC,EAAE,GAAG7B,KAAK,KAAK,CAAC+B;IAC1B,OAAO;QAACH,IAAI/B,KAAK,KAAK;QAAEgC,IAAIhC,KAAK,MAAM;KAAC;AAC1C;AAkEO,eAAeN,qBACpByC,WAAmB,EACnBnC,IAAU,EACVoC,aAA6C;IAE7C,IAAIA,kBAAkBC,mBAAmB,IAAI,EAAE;QAC7C7D,MAAM,uCAAuCwB;QAC7C,MAAMsC,gBAAgBtC,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMuC,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAezD,KAAK,IAAI,CAACwD,YAAYD;YAC3C,MAAMG,WAAW1D,KAAK,KAAK,CAACiB,KAAK,KAAK,GAAGwC;YACzC,MAAME,YAAY3D,KAAK,KAAK,CAACiB,KAAK,MAAM,GAAGwC;YAC3ChE,MACE,2DACAiE,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}