@midscene/core 0.25.4-beta-20250807062119.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +7 -6
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-G2JTYWI6.js → chunk-I5LBWOQA.js} +156 -373
  4. package/dist/es/chunk-I5LBWOQA.js.map +1 -0
  5. package/dist/es/{chunk-JH54OF4E.js → chunk-NBFEZEAH.js} +3 -3
  6. package/dist/es/index.d.ts +6 -6
  7. package/dist/es/index.js +4 -5
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-f449f3b8.d.ts → llm-planning-92cec090.d.ts} +2 -3
  10. package/dist/es/{types-7435eba0.d.ts → types-b4a208c6.d.ts} +3 -9
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +7 -6
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-G2JTYWI6.js → chunk-I5LBWOQA.js} +141 -358
  16. package/dist/lib/chunk-I5LBWOQA.js.map +1 -0
  17. package/dist/lib/{chunk-JH54OF4E.js → chunk-NBFEZEAH.js} +3 -3
  18. package/dist/lib/index.d.ts +6 -6
  19. package/dist/lib/index.js +14 -15
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-f449f3b8.d.ts → llm-planning-92cec090.d.ts} +2 -3
  22. package/dist/{types/types-7435eba0.d.ts → lib/types-b4a208c6.d.ts} +3 -9
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +7 -6
  26. package/dist/types/index.d.ts +6 -6
  27. package/dist/types/{llm-planning-f449f3b8.d.ts → llm-planning-92cec090.d.ts} +2 -3
  28. package/dist/{lib/types-7435eba0.d.ts → types/types-b4a208c6.d.ts} +3 -9
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-G2JTYWI6.js.map +0 -1
  32. package/dist/lib/chunk-G2JTYWI6.js.map +0 -1
  33. /package/dist/es/{chunk-JH54OF4E.js.map → chunk-NBFEZEAH.js.map} +0 -0
  34. /package/dist/lib/{chunk-JH54OF4E.js.map → chunk-NBFEZEAH.js.map} +0 -0
@@ -5,16 +5,35 @@ import {
  getBearerTokenProvider
  } from "@azure/identity";
  import {
+ ANTHROPIC_API_KEY,
+ AZURE_OPENAI_API_VERSION,
+ AZURE_OPENAI_DEPLOYMENT,
+ AZURE_OPENAI_ENDPOINT,
+ AZURE_OPENAI_KEY,
  MIDSCENE_API_TYPE,
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_AZURE_OPENAI_SCOPE,
+ MIDSCENE_DEBUG_AI_PROFILE,
+ MIDSCENE_DEBUG_AI_RESPONSE,
  MIDSCENE_LANGSMITH_DEBUG,
+ MIDSCENE_MODEL_NAME,
+ MIDSCENE_OPENAI_HTTP_PROXY,
+ MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_OPENAI_SOCKS_PROXY,
+ MIDSCENE_USE_ANTHROPIC_SDK,
+ MIDSCENE_USE_AZURE_OPENAI,
+ OPENAI_API_KEY,
+ OPENAI_BASE_URL,
  OPENAI_MAX_TOKENS,
- getAIConfig as getAIConfig2,
- getAIConfigInBoolean as getAIConfigInBoolean2,
+ OPENAI_USE_AZURE,
+ getAIConfig,
+ getAIConfigInBoolean,
+ getAIConfigInJson,
  uiTarsModelVersion,
  vlLocateMode as vlLocateMode3
  } from "@midscene/shared/env";
- import { getDebug as getDebug3 } from "@midscene/shared/logger";
- import { assert as assert4 } from "@midscene/shared/utils";
+ import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
+ import { assert as assert3 } from "@midscene/shared/utils";
  import { ifInBrowser } from "@midscene/shared/utils";
  import { HttpsProxyAgent } from "https-proxy-agent";
  import { jsonrepair } from "jsonrepair";
@@ -36,11 +55,10 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
  AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
  return AIActionType2;
  })(AIActionType || {});
- async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
+ async function callAiFn(msgs, AIActionTypeValue) {
  const { content, usage } = await callToGetJSONObject(
  msgs,
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  );
  return { content, usage };
  }
@@ -1145,57 +1163,24 @@ pageDescription:
  });
  };

- // src/ai-model/service-caller/utils.ts
- import {
- ANTHROPIC_API_KEY,
- AZURE_OPENAI_API_VERSION,
- AZURE_OPENAI_DEPLOYMENT,
- AZURE_OPENAI_ENDPOINT,
- AZURE_OPENAI_KEY,
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_AZURE_OPENAI_SCOPE,
- MIDSCENE_DEBUG_AI_PROFILE,
- MIDSCENE_DEBUG_AI_RESPONSE,
- MIDSCENE_MODEL_NAME,
- MIDSCENE_OPENAI_HTTP_PROXY,
- MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_OPENAI_SOCKS_PROXY,
- MIDSCENE_USE_ANTHROPIC_SDK,
- MIDSCENE_USE_AZURE_OPENAI,
- MIDSCENE_VQA_ANTHROPIC_API_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_AZURE_OPENAI_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- MIDSCENE_VQA_MODEL_NAME,
- MIDSCENE_VQA_OPENAI_API_KEY,
- MIDSCENE_VQA_OPENAI_BASE_URL,
- MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- MIDSCENE_VQA_OPENAI_USE_AZURE,
- MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- MIDSCENE_VQA_USE_AZURE_OPENAI,
- OPENAI_API_KEY,
- OPENAI_BASE_URL,
- OPENAI_USE_AZURE,
- getAIConfig,
- getAIConfigInBoolean,
- getAIConfigInJson
- } from "@midscene/shared/env";
- import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
- import { assert as assert3 } from "@midscene/shared/utils";
- function getModelName() {
- let modelName = "gpt-4o";
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
- if (nameInConfig) {
- modelName = nameInConfig;
- }
- return modelName;
+ // src/ai-model/service-caller/index.ts
+ function checkAIConfig() {
+ const openaiKey = getAIConfig(OPENAI_API_KEY);
+ const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
+ const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
+ const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ if (openaiKey)
+ return true;
+ if (azureConfig)
+ return true;
+ if (anthropicKey)
+ return true;
+ return Boolean(initConfigJson);
  }
+ var debugConfigInitialized = false;
  function initDebugConfig() {
+ if (debugConfigInitialized)
+ return;
  const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
  let debugConfig = "";
  if (shouldPrintTiming) {
@@ -1220,232 +1205,27 @@ function initDebugConfig() {
  if (debugConfig) {
  enableDebug(debugConfig);
  }
+ debugConfigInitialized = true;
  }
- var createAssert = (modelNameKey, modelName) => (value, key, modelVendorFlag) => {
- if (modelVendorFlag) {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName} and ${modelVendorFlag} has also been specified, but got: ${value}
- Please check your config.`
- );
- } else {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName}, but got: ${value}
- Please check your config.`
- );
- }
- };
- var getModelConfigFromEnv = (modelName, keys, valueAssert) => {
- const socksProxy = getAIConfig(keys.socksProxy);
- const httpProxy = getAIConfig(keys.httpProxy);
- if (getAIConfig(keys.openaiUseAzureDeprecated)) {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(
- openaiBaseURL,
- keys.openaiBaseURL,
- keys.openaiUseAzureDeprecated
- );
- valueAssert(openaiApiKey, keys.openaiApiKey, keys.openaiUseAzureDeprecated);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiUseAzureDeprecated: true,
- openaiApiKey,
- openaiBaseURL,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAzureOpenai)) {
- const azureOpenaiScope = getAIConfig(keys.azureOpenaiScope);
- const azureOpenaiApiKey = getAIConfig(keys.azureOpenaiApiKey);
- const azureOpenaiEndpoint = getAIConfig(keys.azureOpenaiEndpoint);
- const azureOpenaiDeployment = getAIConfig(keys.azureOpenaiDeployment);
- const azureOpenaiApiVersion = getAIConfig(keys.azureOpenaiApiVersion);
- const azureExtraConfig = getAIConfigInJson(keys.azureExtraConfig);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(azureOpenaiApiKey, keys.azureOpenaiApiKey, keys.useAzureOpenai);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAzureOpenai: true,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiDeployment,
- azureOpenaiApiVersion,
- azureExtraConfig,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAnthropicSdk)) {
- const anthropicApiKey = getAIConfig(keys.anthropicApiKey);
- valueAssert(anthropicApiKey, keys.anthropicApiKey, keys.useAnthropicSdk);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAnthropicSdk: true,
- anthropicApiKey
- };
- } else {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(openaiBaseURL, keys.openaiBaseURL);
- valueAssert(openaiApiKey, keys.openaiApiKey);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig
- };
- }
- };
- var maskKey = (key, maskChar = "*") => {
- if (typeof key !== "string" || key.length === 0) {
- return key;
- }
- const prefixLen = 3;
- const suffixLen = 3;
- const keepLength = prefixLen + suffixLen;
- if (key.length <= keepLength) {
- return key;
- }
- const prefix = key.substring(0, prefixLen);
- const suffix = key.substring(key.length - suffixLen);
- const maskLength = key.length - keepLength;
- const mask = maskChar.repeat(maskLength);
- return `${prefix}${mask}${suffix}`;
- };
- var maskConfig = (config) => {
- return Object.fromEntries(
- Object.entries(config).map(([key, value]) => [
- key,
- ["openaiApiKey", "azureOpenaiApiKey", "anthropicApiKey"].includes(key) ? maskKey(value) : value
- ])
- );
- };
- var decideModelConfig = (modelPreferences) => {
- initDebugConfig();
- const debugLog = getDebug2("ai:decideModelConfig");
- debugLog("modelPreferences", modelPreferences);
- const isVQAIntent = modelPreferences?.intent === "VQA";
- const vqaModelName = getAIConfig(MIDSCENE_VQA_MODEL_NAME);
- if (isVQAIntent && vqaModelName) {
- debugLog(
- `current action is a VQA action and detected ${MIDSCENE_VQA_MODEL_NAME} ${vqaModelName}, will only read VQA related model config from process.env`
- );
- const config = getModelConfigFromEnv(
- vqaModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: MIDSCENE_VQA_OPENAI_BASE_URL,
- openaiApiKey: MIDSCENE_VQA_OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: MIDSCENE_VQA_OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_VQA_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: MIDSCENE_VQA_AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- anthropicApiKey: MIDSCENE_VQA_ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_VQA_MODEL_NAME, vqaModelName)
- );
- debugLog("got model config for VQA usage:", maskConfig(config));
- return config;
- } else {
- debugLog("read model config from process.env as normal.");
- const commonModelName = getAIConfig(MIDSCENE_MODEL_NAME);
- assert3(
- commonModelName,
- `${MIDSCENE_MODEL_NAME} is empty, please check your config.`
- );
- const config = getModelConfigFromEnv(
- commonModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: OPENAI_BASE_URL,
- openaiApiKey: OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_USE_ANTHROPIC_SDK,
- anthropicApiKey: ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_MODEL_NAME, commonModelName)
- );
- debugLog("got model config for common usage:", maskConfig(config));
- return config;
+ var defaultModel = "gpt-4o";
+ function getModelName() {
+ let modelName = defaultModel;
+ const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+ if (nameInConfig) {
+ modelName = nameInConfig;
  }
- };
-
- // src/ai-model/service-caller/index.ts
+ return modelName;
+ }
  async function createChatClient({
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  }) {
- const {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig,
- openaiUseAzureDeprecated,
- useAzureOpenai,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiApiVersion,
- azureOpenaiDeployment,
- azureExtraConfig,
- useAnthropicSdk,
- anthropicApiKey
- } = decideModelConfig(modelPreferences);
+ initDebugConfig();
  let openai;
+ const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
+ const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
  let proxyAgent = void 0;
- const debugProxy = getDebug3("ai:call:proxy");
+ const debugProxy = getDebug2("ai:call:proxy");
  if (httpProxy) {
  debugProxy("using http proxy", httpProxy);
  proxyAgent = new HttpsProxyAgent(httpProxy);
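Note on the block removed above: along with the VQA-specific decideModelConfig path, 0.26.0 drops the maskKey/maskConfig helpers that masked API keys in debug output. For callers who relied on that masking in their own logs, a rough stand-in for the removed maskKey behaviour (keep the first and last three characters, mask the rest) is sketched here; it is not shipped in the package.

// Sketch only (not part of @midscene/core 0.26.0): equivalent of the removed maskKey helper.
const maskKey = (key, maskChar = "*") => {
  if (typeof key !== "string" || key.length <= 6) return key; // too short to mask, same as the removed helper
  return key.slice(0, 3) + maskChar.repeat(key.length - 6) + key.slice(-3);
};
// maskKey("sk-abcdef123456") === "sk-*********456"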
@@ -1453,56 +1233,70 @@ async function createChatClient({
  debugProxy("using socks proxy", socksProxy);
  proxyAgent = new SocksProxyAgent(socksProxy);
  }
- if (openaiUseAzureDeprecated) {
+ if (getAIConfig(OPENAI_USE_AZURE)) {
  openai = new AzureOpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  dangerouslyAllowBrowser: true
  });
- } else if (useAzureOpenai) {
+ } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
+ const extraAzureConfig = getAIConfigInJson(
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
+ );
+ const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
  let tokenProvider = void 0;
- if (azureOpenaiScope) {
- assert4(
+ if (scope) {
+ assert3(
  !ifInBrowser,
  "Azure OpenAI is not supported in browser with Midscene."
  );
  const credential = new DefaultAzureCredential();
- tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);
+ assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
+ tokenProvider = getBearerTokenProvider(credential, scope);
  openai = new AzureOpenAI({
  azureADTokenProvider: tokenProvider,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
+ ...extraConfig,
+ ...extraAzureConfig
  });
  } else {
  openai = new AzureOpenAI({
- apiKey: azureOpenaiApiKey,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
+ apiKey: getAIConfig(AZURE_OPENAI_KEY),
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
  dangerouslyAllowBrowser: true,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ ...extraConfig,
+ ...extraAzureConfig
  });
  }
- } else if (!useAnthropicSdk) {
+ } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const baseURL = getAIConfig(OPENAI_BASE_URL);
+ if (typeof baseURL === "string") {
+ if (!/^https?:\/\//.test(baseURL)) {
+ throw new Error(
+ `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
+ Please check your config.`
+ );
+ }
+ }
  openai = new OpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  defaultHeaders: {
- ...openaiExtraConfig?.defaultHeaders || {},
+ ...extraConfig?.defaultHeaders || {},
  [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
  },
  dangerouslyAllowBrowser: true
  });
  }
- if (openai && getAIConfigInBoolean2(MIDSCENE_LANGSMITH_DEBUG)) {
+ if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
  if (ifInBrowser) {
  throw new Error("langsmith is not supported in browser");
  }
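As the branch above shows, createChatClient in 0.26.0 reads each setting directly from the shared env config instead of a precomputed model config object, and the plain-OpenAI path now rejects an OPENAI_BASE_URL that does not start with http:// or https://. A minimal sketch of the environment a consumer of that path would provide, assuming the getAIConfig helpers pick these values up from process.env (variable names are taken from this diff; the values are placeholders):

// Sketch only, placeholder values:
process.env.OPENAI_API_KEY = "sk-...";                            // any non-empty value satisfies checkAIConfig()
process.env.OPENAI_BASE_URL = "https://api.openai.com/v1";        // must start with http:// or https:// in 0.26.0
process.env.MIDSCENE_OPENAI_HTTP_PROXY = "http://127.0.0.1:7890"; // optional; routed through HttpsProxyAgent
process.env.MIDSCENE_MODEL_NAME = "gpt-4o";                       // optional; getModelName() falls back to "gpt-4o"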
@@ -1513,13 +1307,14 @@ async function createChatClient({
  if (typeof openai !== "undefined") {
  return {
  completion: openai.chat.completions,
- style: "openai",
- modelName
+ style: "openai"
  };
  }
- if (useAnthropicSdk) {
+ if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const apiKey = getAIConfig(ANTHROPIC_API_KEY);
+ assert3(apiKey, "ANTHROPIC_API_KEY is required");
  openai = new Anthropic({
- apiKey: anthropicApiKey,
+ apiKey,
  httpAgent: proxyAgent,
  dangerouslyAllowBrowser: true
  });
@@ -1527,23 +1322,25 @@ async function createChatClient({
  if (typeof openai !== "undefined" && openai.messages) {
  return {
  completion: openai.messages,
- style: "anthropic",
- modelName
+ style: "anthropic"
  };
  }
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
  }
- async function call2(messages, AIActionTypeValue, options, modelPreferences) {
- const { completion, style, modelName } = await createChatClient({
- AIActionTypeValue,
- modelPreferences
+ async function call2(messages, AIActionTypeValue, responseFormat, options) {
+ assert3(
+ checkAIConfig(),
+ "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
+ );
+ const { completion, style } = await createChatClient({
+ AIActionTypeValue
  });
- const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
- const maxTokens = getAIConfig2(OPENAI_MAX_TOKENS);
- const debugCall = getDebug3("ai:call");
- const debugProfileStats = getDebug3("ai:profile:stats");
- const debugProfileDetail = getDebug3("ai:profile:detail");
+ const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
+ const debugCall = getDebug2("ai:call");
+ const debugProfileStats = getDebug2("ai:profile:stats");
+ const debugProfileDetail = getDebug2("ai:profile:detail");
  const startTime = Date.now();
+ const model = getModelName();
  const isStreaming = options?.stream && options?.onChunk;
  let content;
  let accumulated = "";
@@ -1560,12 +1357,12 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  try {
  if (style === "openai") {
  debugCall(
- `sending ${isStreaming ? "streaming " : ""}request to ${modelName}`
+ `sending ${isStreaming ? "streaming " : ""}request to ${model}`
  );
  if (isStreaming) {
  const stream = await completion.create(
  {
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
@@ -1622,23 +1419,23 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  }
  content = accumulated;
  debugProfileStats(
- `streaming model, ${modelName}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
+ `streaming model, ${model}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
  );
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
  });
  timeCost = Date.now() - startTime;
  debugProfileStats(
- `model, ${modelName}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
+ `model, ${model}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
  );
  debugProfileDetail(
  `model usage detail: ${JSON.stringify(result.usage)}`
  );
- assert4(
+ assert3(
  result.choices,
  `invalid response from LLM service: ${JSON.stringify(result)}`
  );
@@ -1646,12 +1443,12 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  usage = result.usage;
  }
  debugCall(`response: ${content}`);
- assert4(content, "empty content");
+ assert3(content, "empty content");
  } else if (style === "anthropic") {
  const convertImageContent = (content2) => {
  if (content2.type === "image_url") {
  const imgBase64 = content2.image_url.url;
- assert4(imgBase64, "image_url is required");
+ assert3(imgBase64, "image_url is required");
  return {
  source: {
  type: "base64",
@@ -1665,7 +1462,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  };
  if (isStreaming) {
  const stream = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1709,7 +1506,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = accumulated;
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1722,7 +1519,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = result.content[0].text;
  usage = result.usage;
  }
- assert4(content, "empty content");
+ assert3(content, "empty content");
  }
  if (isStreaming && !usage) {
  const estimatedTokens = Math.max(
@@ -1756,9 +1553,10 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  throw newError;
  }
  }
- var getResponseFormat = (modelName, AIActionTypeValue) => {
+ async function callToGetJSONObject(messages, AIActionTypeValue) {
  let responseFormat;
- if (modelName.includes("gpt-4")) {
+ const model = getModelName();
+ if (model.includes("gpt-4")) {
  switch (AIActionTypeValue) {
  case 0 /* ASSERT */:
  responseFormat = assertSchema;
@@ -1775,19 +1573,11 @@ var getResponseFormat = (modelName, AIActionTypeValue) => {
  break;
  }
  }
- if (modelName === "gpt-4o-2024-05-13") {
+ if (model === "gpt-4o-2024-05-13") {
  responseFormat = { type: "json_object" /* JSON */ };
  }
- return responseFormat;
- };
- async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
- const response = await call2(
- messages,
- AIActionTypeValue,
- void 0,
- modelPreferences
- );
- assert4(response, "empty response");
+ const response = await call2(messages, AIActionTypeValue, responseFormat);
+ assert3(response, "empty response");
  const jsonContent = safeParseJson(response.content);
  return { content: jsonContent, usage: response.usage };
  }
@@ -2071,7 +1861,7 @@ Respond with YAML only, no explanations.`
  });
  }
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2194,7 +1984,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  }
  ];
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2215,7 +2005,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  import {
  MIDSCENE_USE_QWEN_VL,
  MIDSCENE_USE_VLM_UI_TARS,
- getAIConfigInBoolean as getAIConfigInBoolean3,
+ getAIConfigInBoolean as getAIConfigInBoolean2,
  vlLocateMode as vlLocateMode4
  } from "@midscene/shared/env";
  import {
@@ -2223,8 +2013,8 @@ import {
  paddingToMatchBlockByBase64,
  preProcessImageUrl
  } from "@midscene/shared/img";
- import { getDebug as getDebug4 } from "@midscene/shared/logger";
- import { assert as assert5 } from "@midscene/shared/utils";
+ import { getDebug as getDebug3 } from "@midscene/shared/logger";
+ import { assert as assert4 } from "@midscene/shared/utils";

  // src/ai-model/prompt/extraction.ts
  import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
@@ -2379,8 +2169,8 @@ var sectionLocatorInstruction = new PromptTemplate4({
  });

  // src/ai-model/inspect.ts
- var debugInspect = getDebug4("ai:inspect");
- var debugSection = getDebug4("ai:section");
+ var debugInspect = getDebug3("ai:inspect");
+ var debugSection = getDebug3("ai:section");
  var extraTextFromUserPrompt = (prompt) => {
  if (typeof prompt === "string") {
  return prompt;
@@ -2434,7 +2224,7 @@ async function AiLocateElement(options) {
  const { context, targetElementDescription, callAI } = options;
  const { screenshotBase64 } = context;
  const { description, elementById, insertElementByPosition } = await describeUserPage(context);
- assert5(
+ assert4(
  targetElementDescription,
  "cannot find the target element description"
  );
@@ -2445,11 +2235,11 @@ async function AiLocateElement(options) {
  const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
  let imagePayload = screenshotBase64;
  if (options.searchConfig) {
- assert5(
+ assert4(
  options.searchConfig.rect,
  "searchArea is provided but its rect cannot be found. Failed to locate element"
  );
- assert5(
+ assert4(
  options.searchConfig.imageBase64,
  "searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
  );
@@ -2601,7 +2391,7 @@ async function AiLocateSection(options) {
  imageBase64 = await cropByRect(
  screenshotBase64,
  sectionRect,
- getAIConfigInBoolean3(MIDSCENE_USE_QWEN_VL)
+ getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
  );
  }
  return {
@@ -2613,13 +2403,7 @@ async function AiLocateSection(options) {
  };
  }
  async function AiExtractElementInfo(options) {
- const {
- dataQuery,
- context,
- extractOption,
- multimodalPrompt,
- modelPreferences
- } = options;
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context, {
@@ -2668,8 +2452,7 @@ async function AiExtractElementInfo(options) {
  }
  const result = await callAiFn(
  msgs,
- 2 /* EXTRACT_DATA */,
- modelPreferences
+ 2 /* EXTRACT_DATA */
  );
  return {
  parseResult: result.content,
@@ -2679,10 +2462,10 @@ async function AiExtractElementInfo(options) {
  }
  async function AiAssert(options) {
  const { assertion, context } = options;
- assert5(assertion, "assertion should not be empty");
+ assert4(assertion, "assertion should not be empty");
  const { screenshotBase64 } = context;
  const systemPrompt = systemPromptToAssert({
- isUITars: getAIConfigInBoolean3(MIDSCENE_USE_VLM_UI_TARS)
+ isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
  });
  const assertionText = extraTextFromUserPrompt(assertion);
  const msgs = [
@@ -2729,7 +2512,7 @@ ${assertionText}
  // src/ai-model/llm-planning.ts
  import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
  import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
- import { assert as assert6 } from "@midscene/shared/utils";
+ import { assert as assert5 } from "@midscene/shared/utils";
  async function plan(userInstruction, opts) {
  const { callAI, context } = opts || {};
  const { screenshotBase64, size } = context;
@@ -2791,7 +2574,7 @@ async function plan(userInstruction, opts) {
  usage,
  yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
  };
- assert6(planFromAI, "can't get plans from AI");
+ assert5(planFromAI, "can't get plans from AI");
  if (vlLocateMode5()) {
  actions.forEach((action) => {
  if (action.locate) {
@@ -2807,7 +2590,7 @@ async function plan(userInstruction, opts) {
  }
  }
  });
- assert6(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+ assert5(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
  } else {
  actions.forEach((action) => {
  if (action.locate?.id) {
@@ -2835,8 +2618,8 @@ import {
  } from "@midscene/shared/env";
  import { resizeImgBase64 } from "@midscene/shared/img";
  import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
- import { getDebug as getDebug5 } from "@midscene/shared/logger";
- import { assert as assert7 } from "@midscene/shared/utils";
+ import { getDebug as getDebug4 } from "@midscene/shared/logger";
+ import { assert as assert6 } from "@midscene/shared/utils";
  import { actionParser } from "@ui-tars/action-parser";

  // src/ai-model/prompt/ui-tars-planning.ts
@@ -2875,7 +2658,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
  var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();

  // src/ai-model/ui-tars-planning.ts
- var debug = getDebug5("ui-tars-planning");
+ var debug = getDebug4("ui-tars-planning");
  var bboxSize = 10;
  var pointToBbox = (point, width, height) => {
  return [
@@ -2913,7 +2696,7 @@ async function vlmPlanning(options) {
  const transformActions = [];
  parsed.forEach((action) => {
  if (action.action_type === "click") {
- assert7(action.action_inputs.start_box, "start_box is required");
+ assert6(action.action_inputs.start_box, "start_box is required");
  const point = getPoint(action.action_inputs.start_box, size);
  transformActions.push({
  type: "Locate",
@@ -2940,8 +2723,8 @@ async function vlmPlanning(options) {
  param: action.thought || ""
  });
  } else if (action.action_type === "drag") {
- assert7(action.action_inputs.start_box, "start_box is required");
- assert7(action.action_inputs.end_box, "end_box is required");
+ assert6(action.action_inputs.start_box, "start_box is required");
+ assert6(action.action_inputs.end_box, "end_box is required");
  const startPoint = getPoint(action.action_inputs.start_box, size);
  const endPoint = getPoint(action.action_inputs.end_box, size);
  transformActions.push({
@@ -3023,7 +2806,7 @@ async function vlmPlanning(options) {
  param: {}
  });
  } else if (action.action_type === "androidLongPress") {
- assert7(
+ assert6(
  action.action_inputs.start_coords,
  "start_coords is required for androidLongPress"
  );
@@ -3139,4 +2922,4 @@ export {
  resizeImageForUiTars
  };

- //# sourceMappingURL=chunk-G2JTYWI6.js.map
+ //# sourceMappingURL=chunk-I5LBWOQA.js.map