@midscene/core 0.30.10 → 1.0.0

This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (203)
  1. package/dist/es/agent/agent.mjs +233 -144
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/execution-session.mjs +41 -0
  4. package/dist/es/agent/execution-session.mjs.map +1 -0
  5. package/dist/es/agent/index.mjs +3 -3
  6. package/dist/es/agent/task-builder.mjs +319 -0
  7. package/dist/es/agent/task-builder.mjs.map +1 -0
  8. package/dist/es/agent/task-cache.mjs +4 -4
  9. package/dist/es/agent/task-cache.mjs.map +1 -1
  10. package/dist/es/agent/tasks.mjs +197 -504
  11. package/dist/es/agent/tasks.mjs.map +1 -1
  12. package/dist/es/agent/ui-utils.mjs +54 -35
  13. package/dist/es/agent/ui-utils.mjs.map +1 -1
  14. package/dist/es/agent/utils.mjs +16 -58
  15. package/dist/es/agent/utils.mjs.map +1 -1
  16. package/dist/es/ai-model/conversation-history.mjs +25 -13
  17. package/dist/es/ai-model/conversation-history.mjs.map +1 -1
  18. package/dist/es/ai-model/index.mjs +4 -4
  19. package/dist/es/ai-model/inspect.mjs +45 -54
  20. package/dist/es/ai-model/inspect.mjs.map +1 -1
  21. package/dist/es/ai-model/llm-planning.mjs +47 -65
  22. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  23. package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
  24. package/dist/es/ai-model/prompt/common.mjs.map +1 -1
  25. package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
  26. package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
  27. package/dist/es/ai-model/prompt/llm-locator.mjs +11 -235
  28. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  29. package/dist/es/ai-model/prompt/llm-planning.mjs +76 -322
  30. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  31. package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -14
  32. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
  33. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
  34. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
  35. package/dist/es/ai-model/prompt/playwright-generator.mjs +2 -2
  36. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  37. package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -1
  38. package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -1
  39. package/dist/es/ai-model/prompt/util.mjs +3 -88
  40. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  41. package/dist/es/ai-model/prompt/yaml-generator.mjs +10 -10
  42. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  43. package/dist/es/ai-model/service-caller/index.mjs +182 -274
  44. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  45. package/dist/es/ai-model/ui-tars-planning.mjs +69 -8
  46. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  47. package/dist/es/{ai-model/common.mjs → common.mjs} +18 -30
  48. package/dist/es/common.mjs.map +1 -0
  49. package/dist/es/device/device-options.mjs +0 -0
  50. package/dist/es/device/index.mjs +29 -12
  51. package/dist/es/device/index.mjs.map +1 -1
  52. package/dist/es/index.mjs +5 -4
  53. package/dist/es/index.mjs.map +1 -1
  54. package/dist/es/report.mjs.map +1 -1
  55. package/dist/es/{insight → service}/index.mjs +38 -51
  56. package/dist/es/service/index.mjs.map +1 -0
  57. package/dist/es/{insight → service}/utils.mjs +3 -3
  58. package/dist/es/service/utils.mjs.map +1 -0
  59. package/dist/es/task-runner.mjs +264 -0
  60. package/dist/es/task-runner.mjs.map +1 -0
  61. package/dist/es/tree.mjs +13 -2
  62. package/dist/es/tree.mjs.map +1 -0
  63. package/dist/es/types.mjs +18 -1
  64. package/dist/es/types.mjs.map +1 -1
  65. package/dist/es/utils.mjs +6 -7
  66. package/dist/es/utils.mjs.map +1 -1
  67. package/dist/es/yaml/builder.mjs.map +1 -1
  68. package/dist/es/yaml/player.mjs +121 -98
  69. package/dist/es/yaml/player.mjs.map +1 -1
  70. package/dist/es/yaml/utils.mjs +1 -1
  71. package/dist/es/yaml/utils.mjs.map +1 -1
  72. package/dist/lib/agent/agent.js +231 -142
  73. package/dist/lib/agent/agent.js.map +1 -1
  74. package/dist/lib/agent/common.js +1 -1
  75. package/dist/lib/agent/execution-session.js +75 -0
  76. package/dist/lib/agent/execution-session.js.map +1 -0
  77. package/dist/lib/agent/index.js +14 -14
  78. package/dist/lib/agent/index.js.map +1 -1
  79. package/dist/lib/agent/task-builder.js +356 -0
  80. package/dist/lib/agent/task-builder.js.map +1 -0
  81. package/dist/lib/agent/task-cache.js +8 -8
  82. package/dist/lib/agent/task-cache.js.map +1 -1
  83. package/dist/lib/agent/tasks.js +202 -506
  84. package/dist/lib/agent/tasks.js.map +1 -1
  85. package/dist/lib/agent/ui-utils.js +58 -36
  86. package/dist/lib/agent/ui-utils.js.map +1 -1
  87. package/dist/lib/agent/utils.js +26 -68
  88. package/dist/lib/agent/utils.js.map +1 -1
  89. package/dist/lib/ai-model/conversation-history.js +27 -15
  90. package/dist/lib/ai-model/conversation-history.js.map +1 -1
  91. package/dist/lib/ai-model/index.js +27 -27
  92. package/dist/lib/ai-model/index.js.map +1 -1
  93. package/dist/lib/ai-model/inspect.js +51 -57
  94. package/dist/lib/ai-model/inspect.js.map +1 -1
  95. package/dist/lib/ai-model/llm-planning.js +49 -67
  96. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  97. package/dist/lib/ai-model/prompt/assertion.js +2 -2
  98. package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
  99. package/dist/lib/ai-model/prompt/common.js +2 -2
  100. package/dist/lib/ai-model/prompt/common.js.map +1 -1
  101. package/dist/lib/ai-model/prompt/describe.js +2 -2
  102. package/dist/lib/ai-model/prompt/describe.js.map +1 -1
  103. package/dist/lib/ai-model/prompt/extraction.js +2 -2
  104. package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
  105. package/dist/lib/ai-model/prompt/llm-locator.js +14 -241
  106. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  107. package/dist/lib/ai-model/prompt/llm-planning.js +79 -328
  108. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  109. package/dist/lib/ai-model/prompt/llm-section-locator.js +17 -16
  110. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
  111. package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
  112. package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
  113. package/dist/lib/ai-model/prompt/playwright-generator.js +11 -11
  114. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  115. package/dist/lib/ai-model/prompt/ui-tars-locator.js +2 -2
  116. package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -1
  117. package/dist/lib/ai-model/prompt/ui-tars-planning.js +2 -2
  118. package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -1
  119. package/dist/lib/ai-model/prompt/util.js +7 -95
  120. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  121. package/dist/lib/ai-model/prompt/yaml-generator.js +18 -18
  122. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  123. package/dist/lib/ai-model/service-caller/index.js +288 -401
  124. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  125. package/dist/lib/ai-model/ui-tars-planning.js +71 -10
  126. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  127. package/dist/lib/{ai-model/common.js → common.js} +40 -55
  128. package/dist/lib/common.js.map +1 -0
  129. package/dist/lib/device/device-options.js +20 -0
  130. package/dist/lib/device/device-options.js.map +1 -0
  131. package/dist/lib/device/index.js +63 -40
  132. package/dist/lib/device/index.js.map +1 -1
  133. package/dist/lib/image/index.js +5 -5
  134. package/dist/lib/image/index.js.map +1 -1
  135. package/dist/lib/index.js +24 -20
  136. package/dist/lib/index.js.map +1 -1
  137. package/dist/lib/report.js +2 -2
  138. package/dist/lib/report.js.map +1 -1
  139. package/dist/lib/{insight → service}/index.js +41 -54
  140. package/dist/lib/service/index.js.map +1 -0
  141. package/dist/lib/{insight → service}/utils.js +7 -7
  142. package/dist/lib/service/utils.js.map +1 -0
  143. package/dist/lib/task-runner.js +301 -0
  144. package/dist/lib/task-runner.js.map +1 -0
  145. package/dist/lib/tree.js +13 -4
  146. package/dist/lib/tree.js.map +1 -1
  147. package/dist/lib/types.js +31 -12
  148. package/dist/lib/types.js.map +1 -1
  149. package/dist/lib/utils.js +16 -17
  150. package/dist/lib/utils.js.map +1 -1
  151. package/dist/lib/yaml/builder.js +2 -2
  152. package/dist/lib/yaml/builder.js.map +1 -1
  153. package/dist/lib/yaml/index.js +16 -22
  154. package/dist/lib/yaml/index.js.map +1 -1
  155. package/dist/lib/yaml/player.js +123 -100
  156. package/dist/lib/yaml/player.js.map +1 -1
  157. package/dist/lib/yaml/utils.js +6 -6
  158. package/dist/lib/yaml/utils.js.map +1 -1
  159. package/dist/lib/yaml.js +1 -1
  160. package/dist/lib/yaml.js.map +1 -1
  161. package/dist/types/agent/agent.d.ts +62 -17
  162. package/dist/types/agent/execution-session.d.ts +36 -0
  163. package/dist/types/agent/index.d.ts +3 -2
  164. package/dist/types/agent/task-builder.d.ts +35 -0
  165. package/dist/types/agent/tasks.d.ts +32 -23
  166. package/dist/types/agent/ui-utils.d.ts +9 -2
  167. package/dist/types/agent/utils.d.ts +9 -35
  168. package/dist/types/ai-model/conversation-history.d.ts +8 -4
  169. package/dist/types/ai-model/index.d.ts +5 -5
  170. package/dist/types/ai-model/inspect.d.ts +20 -12
  171. package/dist/types/ai-model/llm-planning.d.ts +3 -1
  172. package/dist/types/ai-model/prompt/llm-locator.d.ts +1 -6
  173. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
  174. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +1 -3
  175. package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
  176. package/dist/types/ai-model/prompt/util.d.ts +2 -34
  177. package/dist/types/ai-model/service-caller/index.d.ts +2 -3
  178. package/dist/types/ai-model/ui-tars-planning.d.ts +15 -2
  179. package/dist/types/{ai-model/common.d.ts → common.d.ts} +6 -6
  180. package/dist/types/device/device-options.d.ts +57 -0
  181. package/dist/types/device/index.d.ts +55 -39
  182. package/dist/types/index.d.ts +7 -6
  183. package/dist/types/service/index.d.ts +26 -0
  184. package/dist/types/service/utils.d.ts +2 -0
  185. package/dist/types/task-runner.d.ts +49 -0
  186. package/dist/types/tree.d.ts +4 -1
  187. package/dist/types/types.d.ts +103 -66
  188. package/dist/types/yaml/utils.d.ts +1 -1
  189. package/dist/types/yaml.d.ts +68 -43
  190. package/package.json +9 -12
  191. package/dist/es/ai-model/action-executor.mjs +0 -129
  192. package/dist/es/ai-model/action-executor.mjs.map +0 -1
  193. package/dist/es/ai-model/common.mjs.map +0 -1
  194. package/dist/es/insight/index.mjs.map +0 -1
  195. package/dist/es/insight/utils.mjs.map +0 -1
  196. package/dist/lib/ai-model/action-executor.js +0 -163
  197. package/dist/lib/ai-model/action-executor.js.map +0 -1
  198. package/dist/lib/ai-model/common.js.map +0 -1
  199. package/dist/lib/insight/index.js.map +0 -1
  200. package/dist/lib/insight/utils.js.map +0 -1
  201. package/dist/types/ai-model/action-executor.d.ts +0 -19
  202. package/dist/types/insight/index.d.ts +0 -31
  203. package/dist/types/insight/utils.d.ts +0 -2
package/dist/es/ai-model/service-caller/index.mjs
@@ -1,282 +1,207 @@
- import { AIResponseFormat } from "../../types.mjs";
- import { Anthropic } from "@anthropic-ai/sdk";
- import { DefaultAzureCredential, getBearerTokenProvider } from "@azure/identity";
- import { MIDSCENE_API_TYPE, MIDSCENE_LANGSMITH_DEBUG, OPENAI_MAX_TOKENS, globalConfigManager } from "@midscene/shared/env";
- import { parseBase64 } from "@midscene/shared/img";
+ import { MIDSCENE_LANGFUSE_DEBUG, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MODEL_MAX_TOKENS, OPENAI_MAX_TOKENS, globalConfigManager } from "@midscene/shared/env";
  import { getDebug } from "@midscene/shared/logger";
  import { assert, ifInBrowser } from "@midscene/shared/utils";
- import { HttpsProxyAgent } from "https-proxy-agent";
  import { jsonrepair } from "jsonrepair";
- import openai_0, { AzureOpenAI } from "openai";
- import { SocksProxyAgent } from "socks-proxy-agent";
- import { AIActionType } from "../common.mjs";
- import { assertSchema } from "../prompt/assertion.mjs";
- import { locatorSchema } from "../prompt/llm-locator.mjs";
- import { planSchema } from "../prompt/llm-planning.mjs";
+ import openai_0 from "openai";
  async function createChatClient({ AIActionTypeValue, modelConfig }) {
- const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, openaiUseAzureDeprecated, useAzureOpenai, azureOpenaiScope, azureOpenaiKey, azureOpenaiEndpoint, azureOpenaiApiVersion, azureOpenaiDeployment, azureExtraConfig, useAnthropicSdk, anthropicApiKey, modelDescription, uiTarsModelVersion: uiTarsVersion, vlMode } = modelConfig;
- let openai;
+ const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, modelDescription, uiTarsModelVersion: uiTarsVersion, vlMode, createOpenAIClient, timeout } = modelConfig;
  let proxyAgent;
  const debugProxy = getDebug('ai:call:proxy');
+ const sanitizeProxyUrl = (url)=>{
+ try {
+ const parsed = new URL(url);
+ if (parsed.username) {
+ parsed.password = '****';
+ return parsed.href;
+ }
+ return url;
+ } catch {
+ return url;
+ }
+ };
  if (httpProxy) {
- debugProxy('using http proxy', httpProxy);
- proxyAgent = new HttpsProxyAgent(httpProxy);
+ debugProxy('using http proxy', sanitizeProxyUrl(httpProxy));
+ if (ifInBrowser) console.warn('HTTP proxy is configured but not supported in browser environment');
+ else {
+ const moduleName = 'undici';
+ const { ProxyAgent } = await import(moduleName);
+ proxyAgent = new ProxyAgent({
+ uri: httpProxy
+ });
+ }
  } else if (socksProxy) {
- debugProxy('using socks proxy', socksProxy);
- proxyAgent = new SocksProxyAgent(socksProxy);
- }
- if (openaiUseAzureDeprecated) openai = new AzureOpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
- httpAgent: proxyAgent,
- ...openaiExtraConfig,
- dangerouslyAllowBrowser: true
- });
- else if (useAzureOpenai) {
- let tokenProvider;
- if (azureOpenaiScope) {
- assert(!ifInBrowser, 'Azure OpenAI is not supported in browser with Midscene.');
- const credential = new DefaultAzureCredential();
- tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);
- openai = new AzureOpenAI({
- azureADTokenProvider: tokenProvider,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ debugProxy('using socks proxy', sanitizeProxyUrl(socksProxy));
+ if (ifInBrowser) console.warn('SOCKS proxy is configured but not supported in browser environment');
+ else try {
+ const moduleName = 'fetch-socks';
+ const { socksDispatcher } = await import(moduleName);
+ const proxyUrl = new URL(socksProxy);
+ if (!proxyUrl.hostname) throw new Error('SOCKS proxy URL must include a valid hostname');
+ const port = Number.parseInt(proxyUrl.port, 10);
+ if (!proxyUrl.port || Number.isNaN(port)) throw new Error('SOCKS proxy URL must include a valid port');
+ const protocol = proxyUrl.protocol.replace(':', '');
+ const socksType = 'socks4' === protocol ? 4 : 'socks5' === protocol ? 5 : 5;
+ proxyAgent = socksDispatcher({
+ type: socksType,
+ host: proxyUrl.hostname,
+ port,
+ ...proxyUrl.username ? {
+ userId: decodeURIComponent(proxyUrl.username),
+ password: decodeURIComponent(proxyUrl.password || '')
+ } : {}
  });
- } else openai = new AzureOpenAI({
- apiKey: azureOpenaiKey,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
- dangerouslyAllowBrowser: true,
- ...openaiExtraConfig,
- ...azureExtraConfig
- });
- } else if (!useAnthropicSdk) openai = new openai_0({
+ debugProxy('socks proxy configured successfully', {
+ type: socksType,
+ host: proxyUrl.hostname,
+ port: port
+ });
+ } catch (error) {
+ console.error('Failed to configure SOCKS proxy:', error);
+ throw new Error(`Invalid SOCKS proxy URL: ${socksProxy}. Expected format: socks4://host:port, socks5://host:port, or with authentication: socks5://user:pass@host:port`);
+ }
+ }
+ const openAIOptions = {
  baseURL: openaiBaseURL,
  apiKey: openaiApiKey,
- httpAgent: proxyAgent,
+ ...proxyAgent ? {
+ fetchOptions: {
+ dispatcher: proxyAgent
+ }
+ } : {},
  ...openaiExtraConfig,
- defaultHeaders: {
- ...(null == openaiExtraConfig ? void 0 : openaiExtraConfig.defaultHeaders) || {},
- [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
- },
+ ...'number' == typeof timeout ? {
+ timeout
+ } : {},
  dangerouslyAllowBrowser: true
- });
+ };
+ const baseOpenAI = new openai_0(openAIOptions);
+ let openai = baseOpenAI;
  if (openai && globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
  if (ifInBrowser) throw new Error('langsmith is not supported in browser');
  console.log('DEBUGGING MODE: langsmith wrapper enabled');
- const { wrapOpenAI } = await import("langsmith/wrappers");
+ const langsmithModule = 'langsmith/wrappers';
+ const { wrapOpenAI } = await import(langsmithModule);
  openai = wrapOpenAI(openai);
  }
- if (void 0 !== openai) return {
+ if (openai && globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGFUSE_DEBUG)) {
+ if (ifInBrowser) throw new Error('langfuse is not supported in browser');
+ console.log('DEBUGGING MODE: langfuse wrapper enabled');
+ const langfuseModule = 'langfuse';
+ const { observeOpenAI } = await import(langfuseModule);
+ openai = observeOpenAI(openai);
+ }
+ if (createOpenAIClient) {
+ const wrappedClient = await createOpenAIClient(baseOpenAI, openAIOptions);
+ if (wrappedClient) openai = wrappedClient;
+ }
+ return {
  completion: openai.chat.completions,
- style: 'openai',
- modelName,
- modelDescription,
- uiTarsVersion,
- vlMode
- };
- if (useAnthropicSdk) openai = new Anthropic({
- apiKey: anthropicApiKey,
- httpAgent: proxyAgent,
- dangerouslyAllowBrowser: true
- });
- if (void 0 !== openai && openai.messages) return {
- completion: openai.messages,
- style: 'anthropic',
  modelName,
  modelDescription,
  uiTarsVersion,
  vlMode
  };
- throw new Error('Openai SDK or Anthropic SDK is not initialized');
  }
  async function callAI(messages, AIActionTypeValue, modelConfig, options) {
- const { completion, style, modelName, modelDescription, uiTarsVersion, vlMode } = await createChatClient({
+ const { completion, modelName, modelDescription, uiTarsVersion, vlMode } = await createChatClient({
  AIActionTypeValue,
  modelConfig
  });
- const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
- const maxTokens = globalConfigManager.getEnvConfigValue(OPENAI_MAX_TOKENS);
+ const maxTokens = globalConfigManager.getEnvConfigValue(MIDSCENE_MODEL_MAX_TOKENS) ?? globalConfigManager.getEnvConfigValue(OPENAI_MAX_TOKENS);
  const debugCall = getDebug('ai:call');
  const debugProfileStats = getDebug('ai:profile:stats');
  const debugProfileDetail = getDebug('ai:profile:detail');
  const startTime = Date.now();
- const isStreaming = (null == options ? void 0 : options.stream) && (null == options ? void 0 : options.onChunk);
+ const isStreaming = options?.stream && options?.onChunk;
  let content;
  let accumulated = '';
  let usage;
  let timeCost;
+ const buildUsageInfo = (usageData)=>{
+ if (!usageData) return;
+ const cachedInputTokens = usageData?.prompt_tokens_details?.cached_tokens;
+ return {
+ prompt_tokens: usageData.prompt_tokens ?? 0,
+ completion_tokens: usageData.completion_tokens ?? 0,
+ total_tokens: usageData.total_tokens ?? 0,
+ cached_input: cachedInputTokens ?? 0,
+ time_cost: timeCost ?? 0,
+ model_name: modelName,
+ model_description: modelDescription,
+ intent: modelConfig.intent
+ };
+ };
  const commonConfig = {
- temperature: 'vlm-ui-tars' === vlMode ? 0.0 : 0.1,
+ temperature: 'vlm-ui-tars' === vlMode ? 0.0 : void 0,
  stream: !!isStreaming,
- max_tokens: 'number' == typeof maxTokens ? maxTokens : Number.parseInt(maxTokens || '2048', 10),
- ...'qwen-vl' === vlMode || 'qwen3-vl' === vlMode ? {
+ max_tokens: 'number' == typeof maxTokens ? maxTokens : void 0,
+ ...'qwen2.5-vl' === vlMode ? {
  vl_high_resolution_images: true
  } : {}
  };
  try {
- if ('openai' === style) {
- debugCall(`sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`);
- if (isStreaming) {
- const stream = await completion.create({
- model: modelName,
- messages,
- response_format: responseFormat,
- ...commonConfig
- }, {
- stream: true
- });
- for await (const chunk of stream){
- var _chunk_choices__delta, _chunk_choices_, _chunk_choices, _chunk_choices__delta1, _chunk_choices_1, _chunk_choices1, _chunk_choices_2, _chunk_choices2;
- const content = (null == (_chunk_choices = chunk.choices) ? void 0 : null == (_chunk_choices_ = _chunk_choices[0]) ? void 0 : null == (_chunk_choices__delta = _chunk_choices_.delta) ? void 0 : _chunk_choices__delta.content) || '';
- const reasoning_content = (null == (_chunk_choices1 = chunk.choices) ? void 0 : null == (_chunk_choices_1 = _chunk_choices1[0]) ? void 0 : null == (_chunk_choices__delta1 = _chunk_choices_1.delta) ? void 0 : _chunk_choices__delta1.reasoning_content) || '';
- if (chunk.usage) usage = chunk.usage;
- if (content || reasoning_content) {
- accumulated += content;
- const chunkData = {
- content,
- reasoning_content,
- accumulated,
- isComplete: false,
- usage: void 0
- };
- options.onChunk(chunkData);
- }
- if (null == (_chunk_choices2 = chunk.choices) ? void 0 : null == (_chunk_choices_2 = _chunk_choices2[0]) ? void 0 : _chunk_choices_2.finish_reason) {
- timeCost = Date.now() - startTime;
- if (!usage) {
- const estimatedTokens = Math.max(1, Math.floor(accumulated.length / 4));
- usage = {
- prompt_tokens: estimatedTokens,
- completion_tokens: estimatedTokens,
- total_tokens: 2 * estimatedTokens
- };
- }
- const finalChunk = {
- content: '',
- accumulated,
- reasoning_content: '',
- isComplete: true,
- usage: {
- prompt_tokens: usage.prompt_tokens ?? 0,
- completion_tokens: usage.completion_tokens ?? 0,
- total_tokens: usage.total_tokens ?? 0,
- time_cost: timeCost ?? 0,
- model_name: modelName,
- model_description: modelDescription,
- intent: modelConfig.intent
- }
- };
- options.onChunk(finalChunk);
- break;
- }
- }
- content = accumulated;
- debugProfileStats(`streaming model, ${modelName}, mode, ${vlMode || 'default'}, cost-ms, ${timeCost}`);
- } else {
- var _result_usage, _result_usage1, _result_usage2;
- const result = await completion.create({
- model: modelName,
- messages,
- response_format: responseFormat,
- ...commonConfig
- });
- timeCost = Date.now() - startTime;
- debugProfileStats(`model, ${modelName}, mode, ${vlMode || 'default'}, ui-tars-version, ${uiTarsVersion}, prompt-tokens, ${(null == (_result_usage = result.usage) ? void 0 : _result_usage.prompt_tokens) || ''}, completion-tokens, ${(null == (_result_usage1 = result.usage) ? void 0 : _result_usage1.completion_tokens) || ''}, total-tokens, ${(null == (_result_usage2 = result.usage) ? void 0 : _result_usage2.total_tokens) || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`);
- debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
- assert(result.choices, `invalid response from LLM service: ${JSON.stringify(result)}`);
- content = result.choices[0].message.content;
- usage = result.usage;
- }
- debugCall(`response: ${content}`);
- assert(content, 'empty content');
- } else if ('anthropic' === style) {
- const convertImageContent = (content)=>{
- if ('image_url' === content.type) {
- const imgBase64 = content.image_url.url;
- assert(imgBase64, 'image_url is required');
- const { mimeType, body } = parseBase64(content.image_url.url);
- return {
- source: {
- type: 'base64',
- media_type: mimeType,
- data: body
- },
- type: 'image'
+ debugCall(`sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`);
+ if (isStreaming) {
+ const stream = await completion.create({
+ model: modelName,
+ messages,
+ ...commonConfig
+ }, {
+ stream: true
+ });
+ for await (const chunk of stream){
+ const content = chunk.choices?.[0]?.delta?.content || '';
+ const reasoning_content = chunk.choices?.[0]?.delta?.reasoning_content || '';
+ if (chunk.usage) usage = chunk.usage;
+ if (content || reasoning_content) {
+ accumulated += content;
+ const chunkData = {
+ content,
+ reasoning_content,
+ accumulated,
+ isComplete: false,
+ usage: void 0
  };
+ options.onChunk(chunkData);
  }
- return content;
- };
- if (isStreaming) {
- const stream = await completion.create({
- model: modelName,
- system: 'You are a versatile professional in software UI automation',
- messages: messages.map((m)=>({
- role: 'user',
- content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
- })),
- response_format: responseFormat,
- ...commonConfig
- });
- for await (const chunk of stream){
- var _chunk_delta;
- const content = (null == (_chunk_delta = chunk.delta) ? void 0 : _chunk_delta.text) || '';
- if (content) {
- accumulated += content;
- const chunkData = {
- content,
- accumulated,
- reasoning_content: '',
- isComplete: false,
- usage: void 0
- };
- options.onChunk(chunkData);
- }
- if ('message_stop' === chunk.type) {
- timeCost = Date.now() - startTime;
- const anthropicUsage = chunk.usage;
- const finalChunk = {
- content: '',
- accumulated,
- reasoning_content: '',
- isComplete: true,
- usage: anthropicUsage ? {
- prompt_tokens: anthropicUsage.input_tokens ?? 0,
- completion_tokens: anthropicUsage.output_tokens ?? 0,
- total_tokens: (anthropicUsage.input_tokens ?? 0) + (anthropicUsage.output_tokens ?? 0),
- time_cost: timeCost ?? 0,
- model_name: modelName,
- model_description: modelDescription,
- intent: modelConfig.intent
- } : void 0
+ if (chunk.choices?.[0]?.finish_reason) {
+ timeCost = Date.now() - startTime;
+ if (!usage) {
+ const estimatedTokens = Math.max(1, Math.floor(accumulated.length / 4));
+ usage = {
+ prompt_tokens: estimatedTokens,
+ completion_tokens: estimatedTokens,
+ total_tokens: 2 * estimatedTokens
  };
- options.onChunk(finalChunk);
- break;
  }
+ const finalChunk = {
+ content: '',
+ accumulated,
+ reasoning_content: '',
+ isComplete: true,
+ usage: buildUsageInfo(usage)
+ };
+ options.onChunk(finalChunk);
+ break;
  }
- content = accumulated;
- } else {
- const result = await completion.create({
- model: modelName,
- system: 'You are a versatile professional in software UI automation',
- messages: messages.map((m)=>({
- role: 'user',
- content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
- })),
- response_format: responseFormat,
- ...commonConfig
- });
- timeCost = Date.now() - startTime;
- content = result.content[0].text;
- usage = result.usage;
  }
- assert(content, 'empty content');
+ content = accumulated;
+ debugProfileStats(`streaming model, ${modelName}, mode, ${vlMode || 'default'}, cost-ms, ${timeCost}`);
+ } else {
+ const result = await completion.create({
+ model: modelName,
+ messages,
+ ...commonConfig
+ });
+ timeCost = Date.now() - startTime;
+ debugProfileStats(`model, ${modelName}, mode, ${vlMode || 'default'}, ui-tars-version, ${uiTarsVersion}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`);
+ debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
+ assert(result.choices, `invalid response from LLM service: ${JSON.stringify(result)}`);
+ content = result.choices[0].message.content;
+ usage = result.usage;
  }
+ debugCall(`response: ${content}`);
+ assert(content, 'empty content');
  if (isStreaming && !usage) {
  const estimatedTokens = Math.max(1, Math.floor((content || '').length / 4));
  usage = {
@@ -287,52 +212,17 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
  }
  return {
  content: content || '',
- usage: usage ? {
- prompt_tokens: usage.prompt_tokens ?? 0,
- completion_tokens: usage.completion_tokens ?? 0,
- total_tokens: usage.total_tokens ?? 0,
- time_cost: timeCost ?? 0,
- model_name: modelName,
- model_description: modelDescription,
- intent: modelConfig.intent
- } : void 0,
+ usage: buildUsageInfo(usage),
  isStreamed: !!isStreaming
  };
  } catch (e) {
  console.error(' call AI error', e);
- const newError = new Error(`failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`, {
+ const newError = new Error(`failed to call ${isStreaming ? 'streaming ' : ''}AI model service (${modelName}): ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`, {
  cause: e
  });
  throw newError;
  }
  }
- const getResponseFormat = (modelName, AIActionTypeValue)=>{
- let responseFormat;
- if (modelName.includes('gpt-4')) switch(AIActionTypeValue){
- case AIActionType.ASSERT:
- responseFormat = assertSchema;
- break;
- case AIActionType.INSPECT_ELEMENT:
- responseFormat = locatorSchema;
- break;
- case AIActionType.PLAN:
- responseFormat = planSchema;
- break;
- case AIActionType.EXTRACT_DATA:
- case AIActionType.DESCRIBE_ELEMENT:
- responseFormat = {
- type: AIResponseFormat.JSON
- };
- break;
- case AIActionType.TEXT:
- responseFormat = void 0;
- break;
- }
- if ('gpt-4o-2024-05-13' === modelName && AIActionTypeValue !== AIActionType.TEXT) responseFormat = {
- type: AIResponseFormat.JSON
- };
- return responseFormat;
- };
  async function callAIWithObjectResponse(messages, AIActionTypeValue, modelConfig) {
  const response = await callAI(messages, AIActionTypeValue, modelConfig);
  assert(response, 'empty response');
@@ -340,6 +230,7 @@ async function callAIWithObjectResponse(messages, AIActionTypeValue, modelConfig
  const jsonContent = safeParseJson(response.content, vlMode);
  return {
  content: jsonContent,
+ contentString: response.content,
  usage: response.usage
  };
  }
@@ -365,24 +256,41 @@ function preprocessDoubaoBboxJson(input) {
  if (input.includes('bbox')) while(/\d+\s+\d+/.test(input))input = input.replace(/(\d+)\s+(\d+)/g, '$1,$2');
  return input;
  }
+ function normalizeJsonObject(obj) {
+ if (null == obj) return obj;
+ if (Array.isArray(obj)) return obj.map((item)=>normalizeJsonObject(item));
+ if ('object' == typeof obj) {
+ const normalized = {};
+ for (const [key, value] of Object.entries(obj)){
+ const trimmedKey = key.trim();
+ let normalizedValue = normalizeJsonObject(value);
+ if ('string' == typeof normalizedValue) normalizedValue = normalizedValue.trim();
+ normalized[trimmedKey] = normalizedValue;
+ }
+ return normalized;
+ }
+ if ('string' == typeof obj) return obj.trim();
+ return obj;
+ }
  function safeParseJson(input, vlMode) {
  const cleanJsonString = extractJSONFromCodeBlock(input);
- if (null == cleanJsonString ? void 0 : cleanJsonString.match(/\((\d+),(\d+)\)/)) {
- var _cleanJsonString_match;
- return null == (_cleanJsonString_match = cleanJsonString.match(/\((\d+),(\d+)\)/)) ? void 0 : _cleanJsonString_match.slice(1).map(Number);
- }
+ if (cleanJsonString?.match(/\((\d+),(\d+)\)/)) return cleanJsonString.match(/\((\d+),(\d+)\)/)?.slice(1).map(Number);
+ let parsed;
  try {
- return JSON.parse(cleanJsonString);
+ parsed = JSON.parse(cleanJsonString);
+ return normalizeJsonObject(parsed);
  } catch {}
  try {
- return JSON.parse(jsonrepair(cleanJsonString));
+ parsed = JSON.parse(jsonrepair(cleanJsonString));
+ return normalizeJsonObject(parsed);
  } catch (e) {}
  if ('doubao-vision' === vlMode || 'vlm-ui-tars' === vlMode) {
  const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
- return JSON.parse(jsonrepair(jsonString));
+ parsed = JSON.parse(jsonrepair(jsonString));
+ return normalizeJsonObject(parsed);
  }
  throw Error(`failed to parse json response: ${input}`);
  }
- export { callAI, callAIWithObjectResponse, callAIWithStringResponse, extractJSONFromCodeBlock, getResponseFormat, preprocessDoubaoBboxJson, safeParseJson };
+ export { callAI, callAIWithObjectResponse, callAIWithStringResponse, extractJSONFromCodeBlock, preprocessDoubaoBboxJson, safeParseJson };

  //# sourceMappingURL=index.mjs.map
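
Notes on the client-construction changes above: the Azure and Anthropic SDK paths are removed outright (hence the dropped @azure/identity and @anthropic-ai/sdk imports), proxies now flow through undici's ProxyAgent or fetch-socks's socksDispatcher passed as a fetch dispatcher rather than an httpAgent, and modelConfig gains two fields: a numeric timeout forwarded to the OpenAI client, and an async createOpenAIClient(baseOpenAI, openAIOptions) hook whose truthy return value replaces the client. A minimal caller-side sketch exercising the new fields; the field names match the destructuring in createChatClient above, while the concrete values are placeholders for illustration:

    // Hypothetical modelConfig; values are made up, field names are from the diff.
    const modelConfig = {
      modelName: 'gpt-4o',
      openaiBaseURL: 'https://api.openai.com/v1',
      openaiApiKey: process.env.OPENAI_API_KEY,
      httpProxy: 'http://127.0.0.1:7890', // routed via undici's ProxyAgent (Node only)
      timeout: 60000, // forwarded to the OpenAI client only when it is a number
      // New hook: receives the freshly built client plus the options used to
      // construct it; returning a truthy value swaps that client in.
      createOpenAIClient: async (baseOpenAI, openAIOptions) => {
        console.log('OpenAI client created for', openAIOptions.baseURL);
        return baseOpenAI; // or return a wrapped/instrumented client instead
      },
    };

Relatedly, the max-tokens cap is now read from MIDSCENE_MODEL_MAX_TOKENS first, falling back to the older OPENAI_MAX_TOKENS, and when neither is set max_tokens is omitted from the request instead of defaulting to 2048.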
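The JSON-parsing changes are also behavior-visible: callAIWithObjectResponse now returns the raw model output as contentString alongside the parsed content, and every successful parse in safeParseJson passes through the new normalizeJsonObject, which recursively trims object keys and string values. A small illustration of that trimming, using an invented input string:

    // Keys and string values come back trimmed; numbers and array shapes are
    // preserved. jsonrepair still backstops malformed JSON before this step.
    safeParseJson('{" action ": " click ", "bbox": [10, 20]}');
    // => { action: 'click', bbox: [10, 20] }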