@midscene/shared 0.29.0 → 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/env/parse.mjs +7 -1
- package/dist/es/env/types.mjs +4 -1
- package/dist/es/img/transform.mjs +27 -7
- package/dist/lib/env/parse.js +6 -0
- package/dist/lib/env/types.js +6 -0
- package/dist/lib/img/transform.js +27 -7
- package/dist/types/env/types.d.ts +5 -4
- package/dist/types/img/transform.d.ts +15 -3
- package/package.json +1 -1
- package/src/env/parse.ts +10 -0
- package/src/env/types.ts +5 -0
- package/src/img/transform.ts +35 -10
package/dist/es/env/parse.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";
|
|
1
|
+
import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN3_VL, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";
|
|
2
2
|
const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
3
3
|
if (!vlModeRaw) return {
|
|
4
4
|
vlMode: void 0,
|
|
@@ -22,15 +22,21 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
|
22
22
|
const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
|
|
23
23
|
const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
|
|
24
24
|
const isQwen = provider[MIDSCENE_USE_QWEN_VL];
|
|
25
|
+
const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
|
|
25
26
|
const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
|
|
26
27
|
const isGemini = provider[MIDSCENE_USE_GEMINI];
|
|
27
28
|
const enabledModes = [
|
|
28
29
|
isDoubao && MIDSCENE_USE_DOUBAO_VISION,
|
|
29
30
|
isQwen && MIDSCENE_USE_QWEN_VL,
|
|
31
|
+
isQwen3 && MIDSCENE_USE_QWEN3_VL,
|
|
30
32
|
isUiTars && MIDSCENE_USE_VLM_UI_TARS,
|
|
31
33
|
isGemini && MIDSCENE_USE_GEMINI
|
|
32
34
|
].filter(Boolean);
|
|
33
35
|
if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
|
|
36
|
+
if (isQwen3) return {
|
|
37
|
+
vlMode: 'qwen3-vl',
|
|
38
|
+
uiTarsVersion: void 0
|
|
39
|
+
};
|
|
34
40
|
if (isQwen) return {
|
|
35
41
|
vlMode: 'qwen-vl',
|
|
36
42
|
uiTarsVersion: void 0
|
package/dist/es/env/types.mjs
CHANGED
|
@@ -22,6 +22,7 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
22
22
|
const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
23
23
|
const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
24
24
|
const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
25
|
+
const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
25
26
|
const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
26
27
|
const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
27
28
|
const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -153,6 +154,7 @@ const MODEL_ENV_KEYS = [
|
|
|
153
154
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
154
155
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
155
156
|
MIDSCENE_USE_QWEN_VL,
|
|
157
|
+
MIDSCENE_USE_QWEN3_VL,
|
|
156
158
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
157
159
|
MIDSCENE_USE_GEMINI,
|
|
158
160
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -239,8 +241,9 @@ const VL_MODE_RAW_VALID_VALUES = [
|
|
|
239
241
|
'doubao-vision',
|
|
240
242
|
'gemini',
|
|
241
243
|
'qwen-vl',
|
|
244
|
+
'qwen3-vl',
|
|
242
245
|
'vlm-ui-tars',
|
|
243
246
|
'vlm-ui-tars-doubao',
|
|
244
247
|
'vlm-ui-tars-doubao-1.5'
|
|
245
248
|
];
|
|
246
|
-
export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, 
MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, MIDSCENE_USE_VL_MODEL, MIDSCENE_VL_MODE, MIDSCENE_VQA_ANTHROPIC_API_KEY, MIDSCENE_VQA_AZURE_OPENAI_API_VERSION, MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT, MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_AZURE_OPENAI_KEY, MIDSCENE_VQA_AZURE_OPENAI_SCOPE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, MIDSCENE_VQA_OPENAI_HTTP_PROXY, MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_OPENAI_SOCKS_PROXY, MIDSCENE_VQA_OPENAI_USE_AZURE, MIDSCENE_VQA_USE_ANTHROPIC_SDK, MIDSCENE_VQA_USE_AZURE_OPENAI, MIDSCENE_VQA_VL_MODE, MODEL_ENV_KEYS, NUMBER_ENV_KEYS, OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_MAX_TOKENS, OPENAI_USE_AZURE, STRING_ENV_KEYS, types_UITarsModelVersion as UITarsModelVersion, UNUSED_ENV_KEYS, VL_MODE_RAW_VALID_VALUES };
|
|
249
|
+
export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, 
MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN3_VL, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, MIDSCENE_USE_VL_MODEL, MIDSCENE_VL_MODE, MIDSCENE_VQA_ANTHROPIC_API_KEY, MIDSCENE_VQA_AZURE_OPENAI_API_VERSION, MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT, MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_AZURE_OPENAI_KEY, MIDSCENE_VQA_AZURE_OPENAI_SCOPE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, MIDSCENE_VQA_OPENAI_HTTP_PROXY, MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_OPENAI_SOCKS_PROXY, MIDSCENE_VQA_OPENAI_USE_AZURE, MIDSCENE_VQA_USE_ANTHROPIC_SDK, MIDSCENE_VQA_USE_AZURE_OPENAI, MIDSCENE_VQA_VL_MODE, MODEL_ENV_KEYS, NUMBER_ENV_KEYS, OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_MAX_TOKENS, OPENAI_USE_AZURE, STRING_ENV_KEYS, types_UITarsModelVersion as UITarsModelVersion, UNUSED_ENV_KEYS, VL_MODE_RAW_VALID_VALUES };
|
|
package/dist/es/img/transform.mjs
CHANGED
|
@@ -106,26 +106,46 @@ async function paddingToMatchBlock(image, blockSize = 28) {
|
|
|
106
106
|
const { width, height } = image.bitmap;
|
|
107
107
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
108
108
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
109
|
-
if (targetWidth === width && targetHeight === height) return
|
|
109
|
+
if (targetWidth === width && targetHeight === height) return {
|
|
110
|
+
width,
|
|
111
|
+
height,
|
|
112
|
+
image
|
|
113
|
+
};
|
|
110
114
|
const Jimp = await get_jimp();
|
|
111
115
|
const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
|
|
112
116
|
paddedImage.composite(image, 0, 0);
|
|
113
|
-
return
|
|
117
|
+
return {
|
|
118
|
+
width: targetWidth,
|
|
119
|
+
height: targetHeight,
|
|
120
|
+
image: paddedImage
|
|
121
|
+
};
|
|
114
122
|
}
|
|
115
123
|
async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
|
|
116
124
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
117
|
-
const
|
|
118
|
-
return
|
|
125
|
+
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
126
|
+
return {
|
|
127
|
+
width: paddedResult.width,
|
|
128
|
+
height: paddedResult.height,
|
|
129
|
+
imageBase64: await jimpToBase64(paddedResult.image)
|
|
130
|
+
};
|
|
119
131
|
}
|
|
120
132
|
async function cropByRect(imageBase64, rect, paddingImage) {
|
|
121
133
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
122
134
|
const { left, top, width, height } = rect;
|
|
123
135
|
jimpImage.crop(left, top, width, height);
|
|
124
136
|
if (paddingImage) {
|
|
125
|
-
const
|
|
126
|
-
return
|
|
137
|
+
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
138
|
+
return {
|
|
139
|
+
width: paddedResult.width,
|
|
140
|
+
height: paddedResult.height,
|
|
141
|
+
imageBase64: await jimpToBase64(paddedResult.image)
|
|
142
|
+
};
|
|
127
143
|
}
|
|
128
|
-
return
|
|
144
|
+
return {
|
|
145
|
+
width: jimpImage.bitmap.width,
|
|
146
|
+
height: jimpImage.bitmap.height,
|
|
147
|
+
imageBase64: await jimpToBase64(jimpImage)
|
|
148
|
+
};
|
|
129
149
|
}
|
|
130
150
|
async function jimpToBase64(image) {
|
|
131
151
|
const Jimp = await get_jimp();
|
package/dist/lib/env/parse.js
CHANGED
|
@@ -51,15 +51,21 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
|
51
51
|
const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
|
|
52
52
|
const isDoubao = provider[external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION];
|
|
53
53
|
const isQwen = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL];
|
|
54
|
+
const isQwen3 = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL];
|
|
54
55
|
const isUiTars = provider[external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS];
|
|
55
56
|
const isGemini = provider[external_types_js_namespaceObject.MIDSCENE_USE_GEMINI];
|
|
56
57
|
const enabledModes = [
|
|
57
58
|
isDoubao && external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION,
|
|
58
59
|
isQwen && external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL,
|
|
60
|
+
isQwen3 && external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL,
|
|
59
61
|
isUiTars && external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS,
|
|
60
62
|
isGemini && external_types_js_namespaceObject.MIDSCENE_USE_GEMINI
|
|
61
63
|
].filter(Boolean);
|
|
62
64
|
if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
|
|
65
|
+
if (isQwen3) return {
|
|
66
|
+
vlMode: 'qwen3-vl',
|
|
67
|
+
uiTarsVersion: void 0
|
|
68
|
+
};
|
|
63
69
|
if (isQwen) return {
|
|
64
70
|
vlMode: 'qwen-vl',
|
|
65
71
|
uiTarsVersion: void 0
|
package/dist/lib/env/types.js
CHANGED
|
@@ -107,6 +107,7 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
107
107
|
MIDSCENE_USE_AZURE_OPENAI: ()=>MIDSCENE_USE_AZURE_OPENAI,
|
|
108
108
|
MIDSCENE_USE_DOUBAO_VISION: ()=>MIDSCENE_USE_DOUBAO_VISION,
|
|
109
109
|
MIDSCENE_USE_GEMINI: ()=>MIDSCENE_USE_GEMINI,
|
|
110
|
+
MIDSCENE_USE_QWEN3_VL: ()=>MIDSCENE_USE_QWEN3_VL,
|
|
110
111
|
MIDSCENE_USE_QWEN_VL: ()=>MIDSCENE_USE_QWEN_VL,
|
|
111
112
|
MIDSCENE_USE_VLM_UI_TARS: ()=>MIDSCENE_USE_VLM_UI_TARS,
|
|
112
113
|
MIDSCENE_USE_VL_MODEL: ()=>MIDSCENE_USE_VL_MODEL,
|
|
@@ -163,6 +164,7 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
163
164
|
const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
164
165
|
const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
165
166
|
const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
167
|
+
const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
166
168
|
const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
167
169
|
const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
168
170
|
const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -294,6 +296,7 @@ const MODEL_ENV_KEYS = [
|
|
|
294
296
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
295
297
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
296
298
|
MIDSCENE_USE_QWEN_VL,
|
|
299
|
+
MIDSCENE_USE_QWEN3_VL,
|
|
297
300
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
298
301
|
MIDSCENE_USE_GEMINI,
|
|
299
302
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -380,6 +383,7 @@ const VL_MODE_RAW_VALID_VALUES = [
|
|
|
380
383
|
'doubao-vision',
|
|
381
384
|
'gemini',
|
|
382
385
|
'qwen-vl',
|
|
386
|
+
'qwen3-vl',
|
|
383
387
|
'vlm-ui-tars',
|
|
384
388
|
'vlm-ui-tars-doubao',
|
|
385
389
|
'vlm-ui-tars-doubao-1.5'
|
|
@@ -467,6 +471,7 @@ exports.MIDSCENE_USE_ANTHROPIC_SDK = __webpack_exports__.MIDSCENE_USE_ANTHROPIC_
|
|
|
467
471
|
exports.MIDSCENE_USE_AZURE_OPENAI = __webpack_exports__.MIDSCENE_USE_AZURE_OPENAI;
|
|
468
472
|
exports.MIDSCENE_USE_DOUBAO_VISION = __webpack_exports__.MIDSCENE_USE_DOUBAO_VISION;
|
|
469
473
|
exports.MIDSCENE_USE_GEMINI = __webpack_exports__.MIDSCENE_USE_GEMINI;
|
|
474
|
+
exports.MIDSCENE_USE_QWEN3_VL = __webpack_exports__.MIDSCENE_USE_QWEN3_VL;
|
|
470
475
|
exports.MIDSCENE_USE_QWEN_VL = __webpack_exports__.MIDSCENE_USE_QWEN_VL;
|
|
471
476
|
exports.MIDSCENE_USE_VLM_UI_TARS = __webpack_exports__.MIDSCENE_USE_VLM_UI_TARS;
|
|
472
477
|
exports.MIDSCENE_USE_VL_MODEL = __webpack_exports__.MIDSCENE_USE_VL_MODEL;
|
|
@@ -582,6 +587,7 @@ for(var __webpack_i__ in __webpack_exports__)if (-1 === [
|
|
|
582
587
|
"MIDSCENE_USE_AZURE_OPENAI",
|
|
583
588
|
"MIDSCENE_USE_DOUBAO_VISION",
|
|
584
589
|
"MIDSCENE_USE_GEMINI",
|
|
590
|
+
"MIDSCENE_USE_QWEN3_VL",
|
|
585
591
|
"MIDSCENE_USE_QWEN_VL",
|
|
586
592
|
"MIDSCENE_USE_VLM_UI_TARS",
|
|
587
593
|
"MIDSCENE_USE_VL_MODEL",
|
|
package/dist/lib/img/transform.js
CHANGED
|
@@ -161,26 +161,46 @@ async function paddingToMatchBlock(image, blockSize = 28) {
|
|
|
161
161
|
const { width, height } = image.bitmap;
|
|
162
162
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
163
163
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
164
|
-
if (targetWidth === width && targetHeight === height) return
|
|
164
|
+
if (targetWidth === width && targetHeight === height) return {
|
|
165
|
+
width,
|
|
166
|
+
height,
|
|
167
|
+
image
|
|
168
|
+
};
|
|
165
169
|
const Jimp = await external_get_jimp_js_default()();
|
|
166
170
|
const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
|
|
167
171
|
paddedImage.composite(image, 0, 0);
|
|
168
|
-
return
|
|
172
|
+
return {
|
|
173
|
+
width: targetWidth,
|
|
174
|
+
height: targetHeight,
|
|
175
|
+
image: paddedImage
|
|
176
|
+
};
|
|
169
177
|
}
|
|
170
178
|
async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
|
|
171
179
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
172
|
-
const
|
|
173
|
-
return
|
|
180
|
+
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
181
|
+
return {
|
|
182
|
+
width: paddedResult.width,
|
|
183
|
+
height: paddedResult.height,
|
|
184
|
+
imageBase64: await jimpToBase64(paddedResult.image)
|
|
185
|
+
};
|
|
174
186
|
}
|
|
175
187
|
async function cropByRect(imageBase64, rect, paddingImage) {
|
|
176
188
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
177
189
|
const { left, top, width, height } = rect;
|
|
178
190
|
jimpImage.crop(left, top, width, height);
|
|
179
191
|
if (paddingImage) {
|
|
180
|
-
const
|
|
181
|
-
return
|
|
192
|
+
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
193
|
+
return {
|
|
194
|
+
width: paddedResult.width,
|
|
195
|
+
height: paddedResult.height,
|
|
196
|
+
imageBase64: await jimpToBase64(paddedResult.image)
|
|
197
|
+
};
|
|
182
198
|
}
|
|
183
|
-
return
|
|
199
|
+
return {
|
|
200
|
+
width: jimpImage.bitmap.width,
|
|
201
|
+
height: jimpImage.bitmap.height,
|
|
202
|
+
imageBase64: await jimpToBase64(jimpImage)
|
|
203
|
+
};
|
|
184
204
|
}
|
|
185
205
|
async function jimpToBase64(image) {
|
|
186
206
|
const Jimp = await external_get_jimp_js_default()();
|
|
package/dist/types/env/types.d.ts
CHANGED
|
@@ -22,6 +22,7 @@ export declare const MIDSCENE_ANDROID_IME_STRATEGY = "MIDSCENE_ANDROID_IME_STRAT
|
|
|
22
22
|
export declare const MIDSCENE_CACHE = "MIDSCENE_CACHE";
|
|
23
23
|
export declare const MIDSCENE_USE_VLM_UI_TARS = "MIDSCENE_USE_VLM_UI_TARS";
|
|
24
24
|
export declare const MIDSCENE_USE_QWEN_VL = "MIDSCENE_USE_QWEN_VL";
|
|
25
|
+
export declare const MIDSCENE_USE_QWEN3_VL = "MIDSCENE_USE_QWEN3_VL";
|
|
25
26
|
export declare const MIDSCENE_USE_DOUBAO_VISION = "MIDSCENE_USE_DOUBAO_VISION";
|
|
26
27
|
export declare const MIDSCENE_USE_GEMINI = "MIDSCENE_USE_GEMINI";
|
|
27
28
|
export declare const MIDSCENE_USE_VL_MODEL = "MIDSCENE_USE_VL_MODEL";
|
|
@@ -125,12 +126,12 @@ export declare const GLOBAL_ENV_KEYS: readonly ["MIDSCENE_CACHE", "MIDSCENE_LANG
|
|
|
125
126
|
* Can be override by both agent.modelConfig and overrideAIConfig
|
|
126
127
|
* Can only be access after agent.constructor
|
|
127
128
|
*/
|
|
128
|
-
export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", 
"MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
129
|
-
export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", 
"MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
129
|
+
export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_QWEN3_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", 
"MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
130
|
+
export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_QWEN3_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", 
"MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
130
131
|
export type TEnvKeys = (typeof ALL_ENV_KEYS)[number];
|
|
131
132
|
export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
|
|
132
|
-
export type TVlModeValues = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
|
|
133
|
-
export type TVlModeTypes = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
|
|
133
|
+
export type TVlModeValues = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
|
|
134
|
+
export type TVlModeTypes = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
|
|
134
135
|
export interface IModelConfigForVQA {
|
|
135
136
|
[MIDSCENE_VQA_MODEL_NAME]: string;
|
|
136
137
|
[MIDSCENE_VQA_OPENAI_SOCKS_PROXY]?: string;
|
|
@@ -49,9 +49,21 @@ export declare function zoomForGPT4o(originalWidth: number, originalHeight: numb
|
|
|
49
49
|
height: number;
|
|
50
50
|
};
|
|
51
51
|
export declare function jimpFromBase64(base64: string): Promise<Jimp>;
|
|
52
|
-
export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<{
|
|
53
|
+
width: number;
|
|
54
|
+
height: number;
|
|
55
|
+
image: Jimp;
|
|
56
|
+
}>;
|
|
57
|
+
export declare function paddingToMatchBlockByBase64(imageBase64: string, blockSize?: number): Promise<{
|
|
58
|
+
width: number;
|
|
59
|
+
height: number;
|
|
60
|
+
imageBase64: string;
|
|
61
|
+
}>;
|
|
62
|
+
export declare function cropByRect(imageBase64: string, rect: Rect, paddingImage: boolean): Promise<{
|
|
63
|
+
width: number;
|
|
64
|
+
height: number;
|
|
65
|
+
imageBase64: string;
|
|
66
|
+
}>;
|
|
55
67
|
export declare function jimpToBase64(image: Jimp): Promise<string>;
|
|
56
68
|
export declare const httpImg2Base64: (url: string) => Promise<string>;
|
|
57
69
|
/**
|
package/package.json
CHANGED
package/src/env/parse.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
3
3
|
MIDSCENE_USE_GEMINI,
|
|
4
|
+
MIDSCENE_USE_QWEN3_VL,
|
|
4
5
|
MIDSCENE_USE_QWEN_VL,
|
|
5
6
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
6
7
|
type TVlModeTypes,
|
|
@@ -58,12 +59,14 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
|
|
|
58
59
|
} => {
|
|
59
60
|
const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
|
|
60
61
|
const isQwen = provider[MIDSCENE_USE_QWEN_VL];
|
|
62
|
+
const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
|
|
61
63
|
const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
|
|
62
64
|
const isGemini = provider[MIDSCENE_USE_GEMINI];
|
|
63
65
|
|
|
64
66
|
const enabledModes = [
|
|
65
67
|
isDoubao && MIDSCENE_USE_DOUBAO_VISION,
|
|
66
68
|
isQwen && MIDSCENE_USE_QWEN_VL,
|
|
69
|
+
isQwen3 && MIDSCENE_USE_QWEN3_VL,
|
|
67
70
|
isUiTars && MIDSCENE_USE_VLM_UI_TARS,
|
|
68
71
|
isGemini && MIDSCENE_USE_GEMINI,
|
|
69
72
|
].filter(Boolean);
|
|
@@ -74,6 +77,13 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
|
|
|
74
77
|
);
|
|
75
78
|
}
|
|
76
79
|
|
|
80
|
+
if (isQwen3) {
|
|
81
|
+
return {
|
|
82
|
+
vlMode: 'qwen3-vl',
|
|
83
|
+
uiTarsVersion: undefined,
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
|
|
77
87
|
if (isQwen) {
|
|
78
88
|
return {
|
|
79
89
|
vlMode: 'qwen-vl',
|
package/src/env/types.ts
CHANGED
|
@@ -29,6 +29,7 @@ export const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
29
29
|
export const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
30
30
|
export const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
31
31
|
export const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
32
|
+
export const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
32
33
|
export const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
33
34
|
export const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
34
35
|
export const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -242,6 +243,7 @@ export const MODEL_ENV_KEYS = [
|
|
|
242
243
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
243
244
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
244
245
|
MIDSCENE_USE_QWEN_VL,
|
|
246
|
+
MIDSCENE_USE_QWEN3_VL,
|
|
245
247
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
246
248
|
MIDSCENE_USE_GEMINI,
|
|
247
249
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -328,6 +330,7 @@ export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
|
|
|
328
330
|
|
|
329
331
|
export type TVlModeValues =
|
|
330
332
|
| 'qwen-vl'
|
|
333
|
+
| 'qwen3-vl'
|
|
331
334
|
| 'doubao-vision'
|
|
332
335
|
| 'gemini'
|
|
333
336
|
| 'vlm-ui-tars'
|
|
@@ -336,6 +339,7 @@ export type TVlModeValues =
|
|
|
336
339
|
|
|
337
340
|
export type TVlModeTypes =
|
|
338
341
|
| 'qwen-vl'
|
|
342
|
+
| 'qwen3-vl'
|
|
339
343
|
| 'doubao-vision'
|
|
340
344
|
| 'gemini'
|
|
341
345
|
| 'vlm-ui-tars';
|
|
@@ -497,6 +501,7 @@ export const VL_MODE_RAW_VALID_VALUES: TVlModeValues[] = [
|
|
|
497
501
|
'doubao-vision',
|
|
498
502
|
'gemini',
|
|
499
503
|
'qwen-vl',
|
|
504
|
+
'qwen3-vl',
|
|
500
505
|
'vlm-ui-tars',
|
|
501
506
|
'vlm-ui-tars-doubao',
|
|
502
507
|
'vlm-ui-tars-doubao-1.5',
|
package/src/img/transform.ts
CHANGED
|
@@ -228,14 +228,18 @@ export async function jimpFromBase64(base64: string): Promise<Jimp> {
|
|
|
228
228
|
export async function paddingToMatchBlock(
|
|
229
229
|
image: Jimp,
|
|
230
230
|
blockSize = 28,
|
|
231
|
-
): Promise<
|
|
231
|
+
): Promise<{
|
|
232
|
+
width: number;
|
|
233
|
+
height: number;
|
|
234
|
+
image: Jimp;
|
|
235
|
+
}> {
|
|
232
236
|
const { width, height } = image.bitmap;
|
|
233
237
|
|
|
234
238
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
235
239
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
236
240
|
|
|
237
241
|
if (targetWidth === width && targetHeight === height) {
|
|
238
|
-
return image;
|
|
242
|
+
return { width, height, image };
|
|
239
243
|
}
|
|
240
244
|
|
|
241
245
|
const Jimp = await getJimp();
|
|
@@ -243,31 +247,52 @@ export async function paddingToMatchBlock(
|
|
|
243
247
|
|
|
244
248
|
// Composite the original image onto the new canvas
|
|
245
249
|
paddedImage.composite(image, 0, 0);
|
|
246
|
-
return paddedImage;
|
|
250
|
+
return { width: targetWidth, height: targetHeight, image: paddedImage };
|
|
247
251
|
}
|
|
248
252
|
|
|
249
253
|
export async function paddingToMatchBlockByBase64(
|
|
250
254
|
imageBase64: string,
|
|
251
255
|
blockSize = 28,
|
|
252
|
-
): Promise<
|
|
256
|
+
): Promise<{
|
|
257
|
+
width: number;
|
|
258
|
+
height: number;
|
|
259
|
+
imageBase64: string;
|
|
260
|
+
}> {
|
|
253
261
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
254
|
-
const
|
|
255
|
-
return
|
|
262
|
+
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
263
|
+
return {
|
|
264
|
+
width: paddedResult.width,
|
|
265
|
+
height: paddedResult.height,
|
|
266
|
+
imageBase64: await jimpToBase64(paddedResult.image),
|
|
267
|
+
};
|
|
256
268
|
}
|
|
269
|
+
|
|
257
270
|
export async function cropByRect(
|
|
258
271
|
imageBase64: string,
|
|
259
272
|
rect: Rect,
|
|
260
273
|
paddingImage: boolean,
|
|
261
|
-
): Promise<
|
|
274
|
+
): Promise<{
|
|
275
|
+
width: number;
|
|
276
|
+
height: number;
|
|
277
|
+
imageBase64: string;
|
|
278
|
+
}> {
|
|
262
279
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
263
280
|
const { left, top, width, height } = rect;
|
|
264
281
|
jimpImage.crop(left, top, width, height);
|
|
265
282
|
|
|
266
283
|
if (paddingImage) {
|
|
267
|
-
const
|
|
268
|
-
return
|
|
284
|
+
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
285
|
+
return {
|
|
286
|
+
width: paddedResult.width,
|
|
287
|
+
height: paddedResult.height,
|
|
288
|
+
imageBase64: await jimpToBase64(paddedResult.image),
|
|
289
|
+
};
|
|
269
290
|
}
|
|
270
|
-
return
|
|
291
|
+
return {
|
|
292
|
+
width: jimpImage.bitmap.width,
|
|
293
|
+
height: jimpImage.bitmap.height,
|
|
294
|
+
imageBase64: await jimpToBase64(jimpImage),
|
|
295
|
+
};
|
|
271
296
|
}
|
|
272
297
|
|
|
273
298
|
export async function jimpToBase64(image: Jimp): Promise<string> {
|