@midscene/core 0.25.4-beta-20250807040242.0 → 0.25.4-beta-20250807062119.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +6 -7
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-I5LBWOQA.js → chunk-G2JTYWI6.js} +373 -156
  4. package/dist/es/chunk-G2JTYWI6.js.map +1 -0
  5. package/dist/es/{chunk-EK3JQ4ZV.js → chunk-JH54OF4E.js} +3 -3
  6. package/dist/es/index.d.ts +6 -6
  7. package/dist/es/index.js +5 -4
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-45dd50cd.d.ts → llm-planning-f449f3b8.d.ts} +3 -2
  10. package/dist/es/{types-da4fb35b.d.ts → types-7435eba0.d.ts} +8 -1
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +6 -7
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-I5LBWOQA.js → chunk-G2JTYWI6.js} +358 -141
  16. package/dist/lib/chunk-G2JTYWI6.js.map +1 -0
  17. package/dist/lib/{chunk-EK3JQ4ZV.js → chunk-JH54OF4E.js} +3 -3
  18. package/dist/lib/index.d.ts +6 -6
  19. package/dist/lib/index.js +15 -14
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-45dd50cd.d.ts → llm-planning-f449f3b8.d.ts} +3 -2
  22. package/dist/{types/types-da4fb35b.d.ts → lib/types-7435eba0.d.ts} +8 -1
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +6 -7
  26. package/dist/types/index.d.ts +6 -6
  27. package/dist/types/{llm-planning-45dd50cd.d.ts → llm-planning-f449f3b8.d.ts} +3 -2
  28. package/dist/{lib/types-da4fb35b.d.ts → types/types-7435eba0.d.ts} +8 -1
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-I5LBWOQA.js.map +0 -1
  32. package/dist/lib/chunk-I5LBWOQA.js.map +0 -1
  33. /package/dist/es/{chunk-EK3JQ4ZV.js.map → chunk-JH54OF4E.js.map} +0 -0
  34. /package/dist/lib/{chunk-EK3JQ4ZV.js.map → chunk-JH54OF4E.js.map} +0 -0
@@ -5,35 +5,16 @@ import {
  getBearerTokenProvider
  } from "@azure/identity";
  import {
- ANTHROPIC_API_KEY,
- AZURE_OPENAI_API_VERSION,
- AZURE_OPENAI_DEPLOYMENT,
- AZURE_OPENAI_ENDPOINT,
- AZURE_OPENAI_KEY,
  MIDSCENE_API_TYPE,
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_AZURE_OPENAI_SCOPE,
- MIDSCENE_DEBUG_AI_PROFILE,
- MIDSCENE_DEBUG_AI_RESPONSE,
  MIDSCENE_LANGSMITH_DEBUG,
- MIDSCENE_MODEL_NAME,
- MIDSCENE_OPENAI_HTTP_PROXY,
- MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_OPENAI_SOCKS_PROXY,
- MIDSCENE_USE_ANTHROPIC_SDK,
- MIDSCENE_USE_AZURE_OPENAI,
- OPENAI_API_KEY,
- OPENAI_BASE_URL,
  OPENAI_MAX_TOKENS,
- OPENAI_USE_AZURE,
- getAIConfig,
- getAIConfigInBoolean,
- getAIConfigInJson,
+ getAIConfig as getAIConfig2,
+ getAIConfigInBoolean as getAIConfigInBoolean2,
  uiTarsModelVersion,
  vlLocateMode as vlLocateMode3
  } from "@midscene/shared/env";
- import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
- import { assert as assert3 } from "@midscene/shared/utils";
+ import { getDebug as getDebug3 } from "@midscene/shared/logger";
+ import { assert as assert4 } from "@midscene/shared/utils";
  import { ifInBrowser } from "@midscene/shared/utils";
  import { HttpsProxyAgent } from "https-proxy-agent";
  import { jsonrepair } from "jsonrepair";
@@ -55,10 +36,11 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
  AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
  return AIActionType2;
  })(AIActionType || {});
- async function callAiFn(msgs, AIActionTypeValue) {
+ async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
  const { content, usage } = await callToGetJSONObject(
  msgs,
- AIActionTypeValue
+ AIActionTypeValue,
+ modelPreferences
  );
  return { content, usage };
  }
@@ -1163,24 +1145,57 @@ pageDescription:
  });
  };

- // src/ai-model/service-caller/index.ts
- function checkAIConfig() {
- const openaiKey = getAIConfig(OPENAI_API_KEY);
- const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
- const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
- const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
- if (openaiKey)
- return true;
- if (azureConfig)
- return true;
- if (anthropicKey)
- return true;
- return Boolean(initConfigJson);
+ // src/ai-model/service-caller/utils.ts
+ import {
+ ANTHROPIC_API_KEY,
+ AZURE_OPENAI_API_VERSION,
+ AZURE_OPENAI_DEPLOYMENT,
+ AZURE_OPENAI_ENDPOINT,
+ AZURE_OPENAI_KEY,
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_AZURE_OPENAI_SCOPE,
+ MIDSCENE_DEBUG_AI_PROFILE,
+ MIDSCENE_DEBUG_AI_RESPONSE,
+ MIDSCENE_MODEL_NAME,
+ MIDSCENE_OPENAI_HTTP_PROXY,
+ MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_OPENAI_SOCKS_PROXY,
+ MIDSCENE_USE_ANTHROPIC_SDK,
+ MIDSCENE_USE_AZURE_OPENAI,
+ MIDSCENE_VQA_ANTHROPIC_API_KEY,
+ MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
+ MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
+ MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
+ MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_VQA_AZURE_OPENAI_KEY,
+ MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
+ MIDSCENE_VQA_MODEL_NAME,
+ MIDSCENE_VQA_OPENAI_API_KEY,
+ MIDSCENE_VQA_OPENAI_BASE_URL,
+ MIDSCENE_VQA_OPENAI_HTTP_PROXY,
+ MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
+ MIDSCENE_VQA_OPENAI_USE_AZURE,
+ MIDSCENE_VQA_USE_ANTHROPIC_SDK,
+ MIDSCENE_VQA_USE_AZURE_OPENAI,
+ OPENAI_API_KEY,
+ OPENAI_BASE_URL,
+ OPENAI_USE_AZURE,
+ getAIConfig,
+ getAIConfigInBoolean,
+ getAIConfigInJson
+ } from "@midscene/shared/env";
+ import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
+ import { assert as assert3 } from "@midscene/shared/utils";
+ function getModelName() {
+ let modelName = "gpt-4o";
+ const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+ if (nameInConfig) {
+ modelName = nameInConfig;
+ }
+ return modelName;
  }
- var debugConfigInitialized = false;
  function initDebugConfig() {
- if (debugConfigInitialized)
- return;
  const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
  let debugConfig = "";
  if (shouldPrintTiming) {
@@ -1205,27 +1220,232 @@ function initDebugConfig() {
  if (debugConfig) {
  enableDebug(debugConfig);
  }
- debugConfigInitialized = true;
  }
- var defaultModel = "gpt-4o";
- function getModelName() {
- let modelName = defaultModel;
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
- if (nameInConfig) {
- modelName = nameInConfig;
+ var createAssert = (modelNameKey, modelName) => (value, key, modelVendorFlag) => {
+ if (modelVendorFlag) {
+ assert3(
+ value,
+ `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName} and ${modelVendorFlag} has also been specified, but got: ${value}
+ Please check your config.`
+ );
+ } else {
+ assert3(
+ value,
+ `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName}, but got: ${value}
+ Please check your config.`
+ );
  }
- return modelName;
- }
+ };
+ var getModelConfigFromEnv = (modelName, keys, valueAssert) => {
+ const socksProxy = getAIConfig(keys.socksProxy);
+ const httpProxy = getAIConfig(keys.httpProxy);
+ if (getAIConfig(keys.openaiUseAzureDeprecated)) {
+ const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
+ const openaiApiKey = getAIConfig(keys.openaiApiKey);
+ const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
+ valueAssert(
+ openaiBaseURL,
+ keys.openaiBaseURL,
+ keys.openaiUseAzureDeprecated
+ );
+ valueAssert(openaiApiKey, keys.openaiApiKey, keys.openaiUseAzureDeprecated);
+ return {
+ socksProxy,
+ httpProxy,
+ modelName,
+ openaiUseAzureDeprecated: true,
+ openaiApiKey,
+ openaiBaseURL,
+ openaiExtraConfig
+ };
+ } else if (getAIConfig(keys.useAzureOpenai)) {
+ const azureOpenaiScope = getAIConfig(keys.azureOpenaiScope);
+ const azureOpenaiApiKey = getAIConfig(keys.azureOpenaiApiKey);
+ const azureOpenaiEndpoint = getAIConfig(keys.azureOpenaiEndpoint);
+ const azureOpenaiDeployment = getAIConfig(keys.azureOpenaiDeployment);
+ const azureOpenaiApiVersion = getAIConfig(keys.azureOpenaiApiVersion);
+ const azureExtraConfig = getAIConfigInJson(keys.azureExtraConfig);
+ const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
+ valueAssert(azureOpenaiApiKey, keys.azureOpenaiApiKey, keys.useAzureOpenai);
+ return {
+ socksProxy,
+ httpProxy,
+ modelName,
+ useAzureOpenai: true,
+ azureOpenaiScope,
+ azureOpenaiApiKey,
+ azureOpenaiEndpoint,
+ azureOpenaiDeployment,
+ azureOpenaiApiVersion,
+ azureExtraConfig,
+ openaiExtraConfig
+ };
+ } else if (getAIConfig(keys.useAnthropicSdk)) {
+ const anthropicApiKey = getAIConfig(keys.anthropicApiKey);
+ valueAssert(anthropicApiKey, keys.anthropicApiKey, keys.useAnthropicSdk);
+ return {
+ socksProxy,
+ httpProxy,
+ modelName,
+ useAnthropicSdk: true,
+ anthropicApiKey
+ };
+ } else {
+ const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
+ const openaiApiKey = getAIConfig(keys.openaiApiKey);
+ const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
+ valueAssert(openaiBaseURL, keys.openaiBaseURL);
+ valueAssert(openaiApiKey, keys.openaiApiKey);
+ return {
+ socksProxy,
+ httpProxy,
+ modelName,
+ openaiBaseURL,
+ openaiApiKey,
+ openaiExtraConfig
+ };
+ }
+ };
+ var maskKey = (key, maskChar = "*") => {
+ if (typeof key !== "string" || key.length === 0) {
+ return key;
+ }
+ const prefixLen = 3;
+ const suffixLen = 3;
+ const keepLength = prefixLen + suffixLen;
+ if (key.length <= keepLength) {
+ return key;
+ }
+ const prefix = key.substring(0, prefixLen);
+ const suffix = key.substring(key.length - suffixLen);
+ const maskLength = key.length - keepLength;
+ const mask = maskChar.repeat(maskLength);
+ return `${prefix}${mask}${suffix}`;
+ };
+ var maskConfig = (config) => {
+ return Object.fromEntries(
+ Object.entries(config).map(([key, value]) => [
+ key,
+ ["openaiApiKey", "azureOpenaiApiKey", "anthropicApiKey"].includes(key) ? maskKey(value) : value
+ ])
+ );
+ };
+ var decideModelConfig = (modelPreferences) => {
+ initDebugConfig();
+ const debugLog = getDebug2("ai:decideModelConfig");
+ debugLog("modelPreferences", modelPreferences);
+ const isVQAIntent = modelPreferences?.intent === "VQA";
+ const vqaModelName = getAIConfig(MIDSCENE_VQA_MODEL_NAME);
+ if (isVQAIntent && vqaModelName) {
+ debugLog(
+ `current action is a VQA action and detected ${MIDSCENE_VQA_MODEL_NAME} ${vqaModelName}, will only read VQA related model config from process.env`
+ );
+ const config = getModelConfigFromEnv(
+ vqaModelName,
+ {
+ /**
+ * proxy
+ */
+ socksProxy: MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
+ httpProxy: MIDSCENE_VQA_OPENAI_HTTP_PROXY,
+ /**
+ * OpenAI
+ */
+ openaiBaseURL: MIDSCENE_VQA_OPENAI_BASE_URL,
+ openaiApiKey: MIDSCENE_VQA_OPENAI_API_KEY,
+ openaiExtraConfig: MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
+ /**
+ * Azure
+ */
+ openaiUseAzureDeprecated: MIDSCENE_VQA_OPENAI_USE_AZURE,
+ useAzureOpenai: MIDSCENE_VQA_USE_AZURE_OPENAI,
+ azureOpenaiScope: MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
+ azureOpenaiApiKey: MIDSCENE_VQA_AZURE_OPENAI_KEY,
+ azureOpenaiEndpoint: MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
+ azureOpenaiApiVersion: MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
+ azureOpenaiDeployment: MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
+ azureExtraConfig: MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
+ /**
+ * Anthropic
+ */
+ useAnthropicSdk: MIDSCENE_VQA_USE_ANTHROPIC_SDK,
+ anthropicApiKey: MIDSCENE_VQA_ANTHROPIC_API_KEY
+ },
+ createAssert(MIDSCENE_VQA_MODEL_NAME, vqaModelName)
+ );
+ debugLog("got model config for VQA usage:", maskConfig(config));
+ return config;
+ } else {
+ debugLog("read model config from process.env as normal.");
+ const commonModelName = getAIConfig(MIDSCENE_MODEL_NAME);
+ assert3(
+ commonModelName,
+ `${MIDSCENE_MODEL_NAME} is empty, please check your config.`
+ );
+ const config = getModelConfigFromEnv(
+ commonModelName,
+ {
+ /**
+ * proxy
+ */
+ socksProxy: MIDSCENE_OPENAI_SOCKS_PROXY,
+ httpProxy: MIDSCENE_OPENAI_HTTP_PROXY,
+ /**
+ * OpenAI
+ */
+ openaiBaseURL: OPENAI_BASE_URL,
+ openaiApiKey: OPENAI_API_KEY,
+ openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+ /**
+ * Azure
+ */
+ openaiUseAzureDeprecated: OPENAI_USE_AZURE,
+ useAzureOpenai: MIDSCENE_USE_AZURE_OPENAI,
+ azureOpenaiScope: MIDSCENE_AZURE_OPENAI_SCOPE,
+ azureOpenaiApiKey: AZURE_OPENAI_KEY,
+ azureOpenaiEndpoint: AZURE_OPENAI_ENDPOINT,
+ azureOpenaiApiVersion: AZURE_OPENAI_API_VERSION,
+ azureOpenaiDeployment: AZURE_OPENAI_DEPLOYMENT,
+ azureExtraConfig: MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+ /**
+ * Anthropic
+ */
+ useAnthropicSdk: MIDSCENE_USE_ANTHROPIC_SDK,
+ anthropicApiKey: ANTHROPIC_API_KEY
+ },
+ createAssert(MIDSCENE_MODEL_NAME, commonModelName)
+ );
+ debugLog("got model config for common usage:", maskConfig(config));
+ return config;
+ }
+ };
+
+ // src/ai-model/service-caller/index.ts
  async function createChatClient({
- AIActionTypeValue
+ AIActionTypeValue,
+ modelPreferences
  }) {
- initDebugConfig();
+ const {
+ socksProxy,
+ httpProxy,
+ modelName,
+ openaiBaseURL,
+ openaiApiKey,
+ openaiExtraConfig,
+ openaiUseAzureDeprecated,
+ useAzureOpenai,
+ azureOpenaiScope,
+ azureOpenaiApiKey,
+ azureOpenaiEndpoint,
+ azureOpenaiApiVersion,
+ azureOpenaiDeployment,
+ azureExtraConfig,
+ useAnthropicSdk,
+ anthropicApiKey
+ } = decideModelConfig(modelPreferences);
  let openai;
- const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
- const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
- const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
  let proxyAgent = void 0;
- const debugProxy = getDebug2("ai:call:proxy");
+ const debugProxy = getDebug3("ai:call:proxy");
  if (httpProxy) {
  debugProxy("using http proxy", httpProxy);
  proxyAgent = new HttpsProxyAgent(httpProxy);
@@ -1233,70 +1453,56 @@ async function createChatClient({
  debugProxy("using socks proxy", socksProxy);
  proxyAgent = new SocksProxyAgent(socksProxy);
  }
- if (getAIConfig(OPENAI_USE_AZURE)) {
+ if (openaiUseAzureDeprecated) {
  openai = new AzureOpenAI({
- baseURL: getAIConfig(OPENAI_BASE_URL),
- apiKey: getAIConfig(OPENAI_API_KEY),
+ baseURL: openaiBaseURL,
+ apiKey: openaiApiKey,
  httpAgent: proxyAgent,
- ...extraConfig,
+ ...openaiExtraConfig,
  dangerouslyAllowBrowser: true
  });
- } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
- const extraAzureConfig = getAIConfigInJson(
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
- );
- const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
+ } else if (useAzureOpenai) {
  let tokenProvider = void 0;
- if (scope) {
- assert3(
+ if (azureOpenaiScope) {
+ assert4(
  !ifInBrowser,
  "Azure OpenAI is not supported in browser with Midscene."
  );
  const credential = new DefaultAzureCredential();
- assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
- tokenProvider = getBearerTokenProvider(credential, scope);
+ tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);
  openai = new AzureOpenAI({
  azureADTokenProvider: tokenProvider,
- endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
- apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
- deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
- ...extraConfig,
- ...extraAzureConfig
+ endpoint: azureOpenaiEndpoint,
+ apiVersion: azureOpenaiApiVersion,
+ deployment: azureOpenaiDeployment,
+ ...openaiExtraConfig,
+ ...azureExtraConfig
  });
  } else {
  openai = new AzureOpenAI({
- apiKey: getAIConfig(AZURE_OPENAI_KEY),
- endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
- apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
- deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
+ apiKey: azureOpenaiApiKey,
+ endpoint: azureOpenaiEndpoint,
+ apiVersion: azureOpenaiApiVersion,
+ deployment: azureOpenaiDeployment,
  dangerouslyAllowBrowser: true,
- ...extraConfig,
- ...extraAzureConfig
+ ...openaiExtraConfig,
+ ...azureExtraConfig
  });
  }
- } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
- const baseURL = getAIConfig(OPENAI_BASE_URL);
- if (typeof baseURL === "string") {
- if (!/^https?:\/\//.test(baseURL)) {
- throw new Error(
- `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
- Please check your config.`
- );
- }
- }
+ } else if (!useAnthropicSdk) {
  openai = new OpenAI({
- baseURL: getAIConfig(OPENAI_BASE_URL),
- apiKey: getAIConfig(OPENAI_API_KEY),
+ baseURL: openaiBaseURL,
+ apiKey: openaiApiKey,
  httpAgent: proxyAgent,
- ...extraConfig,
+ ...openaiExtraConfig,
  defaultHeaders: {
- ...extraConfig?.defaultHeaders || {},
+ ...openaiExtraConfig?.defaultHeaders || {},
  [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
  },
  dangerouslyAllowBrowser: true
  });
  }
- if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
+ if (openai && getAIConfigInBoolean2(MIDSCENE_LANGSMITH_DEBUG)) {
  if (ifInBrowser) {
  throw new Error("langsmith is not supported in browser");
  }
@@ -1307,14 +1513,13 @@ Please check your config.`
  if (typeof openai !== "undefined") {
  return {
  completion: openai.chat.completions,
- style: "openai"
+ style: "openai",
+ modelName
  };
  }
- if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
- const apiKey = getAIConfig(ANTHROPIC_API_KEY);
- assert3(apiKey, "ANTHROPIC_API_KEY is required");
+ if (useAnthropicSdk) {
  openai = new Anthropic({
- apiKey,
+ apiKey: anthropicApiKey,
  httpAgent: proxyAgent,
  dangerouslyAllowBrowser: true
  });
@@ -1322,25 +1527,23 @@ Please check your config.`
  if (typeof openai !== "undefined" && openai.messages) {
  return {
  completion: openai.messages,
- style: "anthropic"
+ style: "anthropic",
+ modelName
  };
  }
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
  }
- async function call2(messages, AIActionTypeValue, responseFormat, options) {
- assert3(
- checkAIConfig(),
- "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
- );
- const { completion, style } = await createChatClient({
- AIActionTypeValue
+ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
+ const { completion, style, modelName } = await createChatClient({
+ AIActionTypeValue,
+ modelPreferences
  });
- const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
- const debugCall = getDebug2("ai:call");
- const debugProfileStats = getDebug2("ai:profile:stats");
- const debugProfileDetail = getDebug2("ai:profile:detail");
+ const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
+ const maxTokens = getAIConfig2(OPENAI_MAX_TOKENS);
+ const debugCall = getDebug3("ai:call");
+ const debugProfileStats = getDebug3("ai:profile:stats");
+ const debugProfileDetail = getDebug3("ai:profile:detail");
  const startTime = Date.now();
- const model = getModelName();
  const isStreaming = options?.stream && options?.onChunk;
  let content;
  let accumulated = "";
@@ -1357,12 +1560,12 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  try {
  if (style === "openai") {
  debugCall(
- `sending ${isStreaming ? "streaming " : ""}request to ${model}`
+ `sending ${isStreaming ? "streaming " : ""}request to ${modelName}`
  );
  if (isStreaming) {
  const stream = await completion.create(
  {
- model,
+ model: modelName,
  messages,
  response_format: responseFormat,
  ...commonConfig
@@ -1419,23 +1622,23 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  }
  content = accumulated;
  debugProfileStats(
- `streaming model, ${model}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
+ `streaming model, ${modelName}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
  );
  } else {
  const result = await completion.create({
- model,
+ model: modelName,
  messages,
  response_format: responseFormat,
  ...commonConfig
  });
  timeCost = Date.now() - startTime;
  debugProfileStats(
- `model, ${model}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
+ `model, ${modelName}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
  );
  debugProfileDetail(
  `model usage detail: ${JSON.stringify(result.usage)}`
  );
- assert3(
+ assert4(
  result.choices,
  `invalid response from LLM service: ${JSON.stringify(result)}`
  );
@@ -1443,12 +1646,12 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  usage = result.usage;
  }
  debugCall(`response: ${content}`);
- assert3(content, "empty content");
+ assert4(content, "empty content");
  } else if (style === "anthropic") {
  const convertImageContent = (content2) => {
  if (content2.type === "image_url") {
  const imgBase64 = content2.image_url.url;
- assert3(imgBase64, "image_url is required");
+ assert4(imgBase64, "image_url is required");
  return {
  source: {
  type: "base64",
@@ -1462,7 +1665,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  };
  if (isStreaming) {
  const stream = await completion.create({
- model,
+ model: modelName,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1506,7 +1709,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  content = accumulated;
  } else {
  const result = await completion.create({
- model,
+ model: modelName,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1519,7 +1722,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  content = result.content[0].text;
  usage = result.usage;
  }
- assert3(content, "empty content");
+ assert4(content, "empty content");
  }
  if (isStreaming && !usage) {
  const estimatedTokens = Math.max(
@@ -1553,10 +1756,9 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
  throw newError;
  }
  }
- async function callToGetJSONObject(messages, AIActionTypeValue) {
+ var getResponseFormat = (modelName, AIActionTypeValue) => {
  let responseFormat;
- const model = getModelName();
- if (model.includes("gpt-4")) {
+ if (modelName.includes("gpt-4")) {
  switch (AIActionTypeValue) {
  case 0 /* ASSERT */:
  responseFormat = assertSchema;
@@ -1573,11 +1775,19 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
  break;
  }
  }
- if (model === "gpt-4o-2024-05-13") {
+ if (modelName === "gpt-4o-2024-05-13") {
  responseFormat = { type: "json_object" /* JSON */ };
  }
- const response = await call2(messages, AIActionTypeValue, responseFormat);
- assert3(response, "empty response");
+ return responseFormat;
+ };
+ async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
+ const response = await call2(
+ messages,
+ AIActionTypeValue,
+ void 0,
+ modelPreferences
+ );
+ assert4(response, "empty response");
  const jsonContent = safeParseJson(response.content);
  return { content: jsonContent, usage: response.usage };
  }
@@ -1861,7 +2071,7 @@ Respond with YAML only, no explanations.`
  });
  }
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -1984,7 +2194,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  }
  ];
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2005,7 +2215,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  import {
  MIDSCENE_USE_QWEN_VL,
  MIDSCENE_USE_VLM_UI_TARS,
- getAIConfigInBoolean as getAIConfigInBoolean2,
+ getAIConfigInBoolean as getAIConfigInBoolean3,
  vlLocateMode as vlLocateMode4
  } from "@midscene/shared/env";
  import {
@@ -2013,8 +2223,8 @@ import {
  paddingToMatchBlockByBase64,
  preProcessImageUrl
  } from "@midscene/shared/img";
- import { getDebug as getDebug3 } from "@midscene/shared/logger";
- import { assert as assert4 } from "@midscene/shared/utils";
+ import { getDebug as getDebug4 } from "@midscene/shared/logger";
+ import { assert as assert5 } from "@midscene/shared/utils";

  // src/ai-model/prompt/extraction.ts
  import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
@@ -2169,8 +2379,8 @@ var sectionLocatorInstruction = new PromptTemplate4({
  });

  // src/ai-model/inspect.ts
- var debugInspect = getDebug3("ai:inspect");
- var debugSection = getDebug3("ai:section");
+ var debugInspect = getDebug4("ai:inspect");
+ var debugSection = getDebug4("ai:section");
  var extraTextFromUserPrompt = (prompt) => {
  if (typeof prompt === "string") {
  return prompt;
@@ -2224,7 +2434,7 @@ async function AiLocateElement(options) {
  const { context, targetElementDescription, callAI } = options;
  const { screenshotBase64 } = context;
  const { description, elementById, insertElementByPosition } = await describeUserPage(context);
- assert4(
+ assert5(
  targetElementDescription,
  "cannot find the target element description"
  );
@@ -2235,11 +2445,11 @@ async function AiLocateElement(options) {
  const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
  let imagePayload = screenshotBase64;
  if (options.searchConfig) {
- assert4(
+ assert5(
  options.searchConfig.rect,
  "searchArea is provided but its rect cannot be found. Failed to locate element"
  );
- assert4(
+ assert5(
  options.searchConfig.imageBase64,
  "searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
  );
@@ -2391,7 +2601,7 @@ async function AiLocateSection(options) {
  imageBase64 = await cropByRect(
  screenshotBase64,
  sectionRect,
- getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
+ getAIConfigInBoolean3(MIDSCENE_USE_QWEN_VL)
  );
  }
  return {
@@ -2403,7 +2613,13 @@ async function AiLocateSection(options) {
  };
  }
  async function AiExtractElementInfo(options) {
- const { dataQuery, context, extractOption, multimodalPrompt } = options;
+ const {
+ dataQuery,
+ context,
+ extractOption,
+ multimodalPrompt,
+ modelPreferences
+ } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context, {
@@ -2452,7 +2668,8 @@ async function AiExtractElementInfo(options) {
  }
  const result = await callAiFn(
  msgs,
- 2 /* EXTRACT_DATA */
+ 2 /* EXTRACT_DATA */,
+ modelPreferences
  );
  return {
  parseResult: result.content,
@@ -2462,10 +2679,10 @@ async function AiExtractElementInfo(options) {
  }
  async function AiAssert(options) {
  const { assertion, context } = options;
- assert4(assertion, "assertion should not be empty");
+ assert5(assertion, "assertion should not be empty");
  const { screenshotBase64 } = context;
  const systemPrompt = systemPromptToAssert({
- isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
+ isUITars: getAIConfigInBoolean3(MIDSCENE_USE_VLM_UI_TARS)
  });
  const assertionText = extraTextFromUserPrompt(assertion);
  const msgs = [
@@ -2512,7 +2729,7 @@ ${assertionText}
  // src/ai-model/llm-planning.ts
  import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
  import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
- import { assert as assert5 } from "@midscene/shared/utils";
+ import { assert as assert6 } from "@midscene/shared/utils";
  async function plan(userInstruction, opts) {
  const { callAI, context } = opts || {};
  const { screenshotBase64, size } = context;
@@ -2574,7 +2791,7 @@ async function plan(userInstruction, opts) {
  usage,
  yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
  };
- assert5(planFromAI, "can't get plans from AI");
+ assert6(planFromAI, "can't get plans from AI");
  if (vlLocateMode5()) {
  actions.forEach((action) => {
  if (action.locate) {
@@ -2590,7 +2807,7 @@ async function plan(userInstruction, opts) {
  }
  }
  });
- assert5(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+ assert6(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
  } else {
  actions.forEach((action) => {
  if (action.locate?.id) {
@@ -2618,8 +2835,8 @@ import {
  } from "@midscene/shared/env";
  import { resizeImgBase64 } from "@midscene/shared/img";
  import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
- import { getDebug as getDebug4 } from "@midscene/shared/logger";
- import { assert as assert6 } from "@midscene/shared/utils";
+ import { getDebug as getDebug5 } from "@midscene/shared/logger";
+ import { assert as assert7 } from "@midscene/shared/utils";
  import { actionParser } from "@ui-tars/action-parser";

  // src/ai-model/prompt/ui-tars-planning.ts
@@ -2658,7 +2875,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
  var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();

  // src/ai-model/ui-tars-planning.ts
- var debug = getDebug4("ui-tars-planning");
+ var debug = getDebug5("ui-tars-planning");
  var bboxSize = 10;
  var pointToBbox = (point, width, height) => {
  return [
@@ -2696,7 +2913,7 @@ async function vlmPlanning(options) {
  const transformActions = [];
  parsed.forEach((action) => {
  if (action.action_type === "click") {
- assert6(action.action_inputs.start_box, "start_box is required");
+ assert7(action.action_inputs.start_box, "start_box is required");
  const point = getPoint(action.action_inputs.start_box, size);
  transformActions.push({
  type: "Locate",
@@ -2723,8 +2940,8 @@ async function vlmPlanning(options) {
  param: action.thought || ""
  });
  } else if (action.action_type === "drag") {
- assert6(action.action_inputs.start_box, "start_box is required");
- assert6(action.action_inputs.end_box, "end_box is required");
+ assert7(action.action_inputs.start_box, "start_box is required");
+ assert7(action.action_inputs.end_box, "end_box is required");
  const startPoint = getPoint(action.action_inputs.start_box, size);
  const endPoint = getPoint(action.action_inputs.end_box, size);
  transformActions.push({
@@ -2806,7 +3023,7 @@ async function vlmPlanning(options) {
  param: {}
  });
  } else if (action.action_type === "androidLongPress") {
- assert6(
+ assert7(
  action.action_inputs.start_coords,
  "start_coords is required for androidLongPress"
  );
@@ -2922,4 +3139,4 @@ export {
  resizeImageForUiTars
  };

- //# sourceMappingURL=chunk-I5LBWOQA.js.map
+ //# sourceMappingURL=chunk-G2JTYWI6.js.map
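
For readers skimming the diff: the new maskKey helper added in src/ai-model/service-caller/utils.ts redacts API keys before they are written to debug logs. A minimal standalone sketch of the same behavior, restated in TypeScript for illustration (not the published implementation; the sample key is hypothetical):

// Sketch of the masking behavior introduced in this diff: keep the first and
// last 3 characters of a key and replace the middle with "*". Keys of 6
// characters or fewer are returned unchanged.
const maskKey = (key: string, maskChar = "*"): string => {
  if (typeof key !== "string" || key.length === 0) {
    return key;
  }
  const prefixLen = 3;
  const suffixLen = 3;
  const keepLength = prefixLen + suffixLen;
  if (key.length <= keepLength) {
    return key;
  }
  const prefix = key.substring(0, prefixLen);
  const suffix = key.substring(key.length - suffixLen);
  const mask = maskChar.repeat(key.length - keepLength);
  return `${prefix}${mask}${suffix}`;
};

// Example with a hypothetical key: only the first and last 3 characters survive.
console.log(maskKey("sk-1234567890abcdef")); // "sk-*************def"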
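The other behavioral change visible above is decideModelConfig: a call flagged with a VQA intent switches to the MIDSCENE_VQA_* configuration set only when a VQA model name is configured; otherwise the common MIDSCENE_MODEL_NAME configuration is used. A condensed sketch of that branch, with a generic getAIConfig lookup standing in for @midscene/shared/env and a hypothetical model name in the example:

// Simplified restatement of the decision in decideModelConfig shown in the diff:
// VQA-intent calls read the VQA-specific config only when a VQA model name is set.
type ModelPreferences = { intent?: "VQA" } | undefined;

const pickConfigSource = (
  prefs: ModelPreferences,
  getAIConfig: (key: string) => string | undefined,
): "vqa" | "common" => {
  const isVQAIntent = prefs?.intent === "VQA";
  const vqaModelName = getAIConfig("MIDSCENE_VQA_MODEL_NAME"); // key name assumed for illustration
  return isVQAIntent && vqaModelName ? "vqa" : "common";
};

// Only the combination of VQA intent plus a configured VQA model switches sources.
console.log(pickConfigSource({ intent: "VQA" }, () => "qwen-vl-max")); // "vqa"
console.log(pickConfigSource(undefined, () => "qwen-vl-max")); // "common"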