@midscene/shared 0.29.0 → 0.29.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";
1
+ import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN3_VL, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";
2
2
  const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
3
3
  if (!vlModeRaw) return {
4
4
  vlMode: void 0,
@@ -22,15 +22,21 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
22
22
  const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
23
23
  const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
24
24
  const isQwen = provider[MIDSCENE_USE_QWEN_VL];
25
+ const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
25
26
  const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
26
27
  const isGemini = provider[MIDSCENE_USE_GEMINI];
27
28
  const enabledModes = [
28
29
  isDoubao && MIDSCENE_USE_DOUBAO_VISION,
29
30
  isQwen && MIDSCENE_USE_QWEN_VL,
31
+ isQwen3 && MIDSCENE_USE_QWEN3_VL,
30
32
  isUiTars && MIDSCENE_USE_VLM_UI_TARS,
31
33
  isGemini && MIDSCENE_USE_GEMINI
32
34
  ].filter(Boolean);
33
35
  if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
36
+ if (isQwen3) return {
37
+ vlMode: 'qwen3-vl',
38
+ uiTarsVersion: void 0
39
+ };
34
40
  if (isQwen) return {
35
41
  vlMode: 'qwen-vl',
36
42
  uiTarsVersion: void 0
@@ -22,6 +22,7 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
22
22
  const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
23
23
  const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
24
24
  const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
25
+ const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
25
26
  const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
26
27
  const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
27
28
  const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
@@ -153,6 +154,7 @@ const MODEL_ENV_KEYS = [
153
154
  MIDSCENE_USE_ANTHROPIC_SDK,
154
155
  MIDSCENE_USE_VLM_UI_TARS,
155
156
  MIDSCENE_USE_QWEN_VL,
157
+ MIDSCENE_USE_QWEN3_VL,
156
158
  MIDSCENE_USE_DOUBAO_VISION,
157
159
  MIDSCENE_USE_GEMINI,
158
160
  MIDSCENE_USE_VL_MODEL,
@@ -239,8 +241,9 @@ const VL_MODE_RAW_VALID_VALUES = [
239
241
  'doubao-vision',
240
242
  'gemini',
241
243
  'qwen-vl',
244
+ 'qwen3-vl',
242
245
  'vlm-ui-tars',
243
246
  'vlm-ui-tars-doubao',
244
247
  'vlm-ui-tars-doubao-1.5'
245
248
  ];
246
- export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, MIDSCENE_USE_VL_MODEL, MIDSCENE_VL_MODE, MIDSCENE_VQA_ANTHROPIC_API_KEY, MIDSCENE_VQA_AZURE_OPENAI_API_VERSION, MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT, MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_AZURE_OPENAI_KEY, MIDSCENE_VQA_AZURE_OPENAI_SCOPE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, MIDSCENE_VQA_OPENAI_HTTP_PROXY, MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_OPENAI_SOCKS_PROXY, MIDSCENE_VQA_OPENAI_USE_AZURE, MIDSCENE_VQA_USE_ANTHROPIC_SDK, MIDSCENE_VQA_USE_AZURE_OPENAI, MIDSCENE_VQA_VL_MODE, MODEL_ENV_KEYS, NUMBER_ENV_KEYS, OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_MAX_TOKENS, OPENAI_USE_AZURE, STRING_ENV_KEYS, types_UITarsModelVersion as UITarsModelVersion, UNUSED_ENV_KEYS, VL_MODE_RAW_VALID_VALUES };
249
+ export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN3_VL, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, MIDSCENE_USE_VL_MODEL, MIDSCENE_VL_MODE, MIDSCENE_VQA_ANTHROPIC_API_KEY, MIDSCENE_VQA_AZURE_OPENAI_API_VERSION, MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT, MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_AZURE_OPENAI_KEY, MIDSCENE_VQA_AZURE_OPENAI_SCOPE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, MIDSCENE_VQA_OPENAI_HTTP_PROXY, MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_OPENAI_SOCKS_PROXY, MIDSCENE_VQA_OPENAI_USE_AZURE, MIDSCENE_VQA_USE_ANTHROPIC_SDK, MIDSCENE_VQA_USE_AZURE_OPENAI, MIDSCENE_VQA_VL_MODE, MODEL_ENV_KEYS, NUMBER_ENV_KEYS, OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_MAX_TOKENS, OPENAI_USE_AZURE, STRING_ENV_KEYS, types_UITarsModelVersion as UITarsModelVersion, UNUSED_ENV_KEYS, VL_MODE_RAW_VALID_VALUES };
@@ -106,26 +106,46 @@ async function paddingToMatchBlock(image, blockSize = 28) {
106
106
  const { width, height } = image.bitmap;
107
107
  const targetWidth = Math.ceil(width / blockSize) * blockSize;
108
108
  const targetHeight = Math.ceil(height / blockSize) * blockSize;
109
- if (targetWidth === width && targetHeight === height) return image;
109
+ if (targetWidth === width && targetHeight === height) return {
110
+ width,
111
+ height,
112
+ image
113
+ };
110
114
  const Jimp = await get_jimp();
111
115
  const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
112
116
  paddedImage.composite(image, 0, 0);
113
- return paddedImage;
117
+ return {
118
+ width: targetWidth,
119
+ height: targetHeight,
120
+ image: paddedImage
121
+ };
114
122
  }
115
123
  async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
116
124
  const jimpImage = await jimpFromBase64(imageBase64);
117
- const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
118
- return jimpToBase64(paddedImage);
125
+ const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
126
+ return {
127
+ width: paddedResult.width,
128
+ height: paddedResult.height,
129
+ imageBase64: await jimpToBase64(paddedResult.image)
130
+ };
119
131
  }
120
132
  async function cropByRect(imageBase64, rect, paddingImage) {
121
133
  const jimpImage = await jimpFromBase64(imageBase64);
122
134
  const { left, top, width, height } = rect;
123
135
  jimpImage.crop(left, top, width, height);
124
136
  if (paddingImage) {
125
- const paddedImage = await paddingToMatchBlock(jimpImage);
126
- return jimpToBase64(paddedImage);
137
+ const paddedResult = await paddingToMatchBlock(jimpImage);
138
+ return {
139
+ width: paddedResult.width,
140
+ height: paddedResult.height,
141
+ imageBase64: await jimpToBase64(paddedResult.image)
142
+ };
127
143
  }
128
- return jimpToBase64(jimpImage);
144
+ return {
145
+ width: jimpImage.bitmap.width,
146
+ height: jimpImage.bitmap.height,
147
+ imageBase64: await jimpToBase64(jimpImage)
148
+ };
129
149
  }
130
150
  async function jimpToBase64(image) {
131
151
  const Jimp = await get_jimp();
@@ -51,15 +51,21 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
51
51
  const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
52
52
  const isDoubao = provider[external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION];
53
53
  const isQwen = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL];
54
+ const isQwen3 = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL];
54
55
  const isUiTars = provider[external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS];
55
56
  const isGemini = provider[external_types_js_namespaceObject.MIDSCENE_USE_GEMINI];
56
57
  const enabledModes = [
57
58
  isDoubao && external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION,
58
59
  isQwen && external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL,
60
+ isQwen3 && external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL,
59
61
  isUiTars && external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS,
60
62
  isGemini && external_types_js_namespaceObject.MIDSCENE_USE_GEMINI
61
63
  ].filter(Boolean);
62
64
  if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
65
+ if (isQwen3) return {
66
+ vlMode: 'qwen3-vl',
67
+ uiTarsVersion: void 0
68
+ };
63
69
  if (isQwen) return {
64
70
  vlMode: 'qwen-vl',
65
71
  uiTarsVersion: void 0
@@ -107,6 +107,7 @@ __webpack_require__.d(__webpack_exports__, {
107
107
  MIDSCENE_USE_AZURE_OPENAI: ()=>MIDSCENE_USE_AZURE_OPENAI,
108
108
  MIDSCENE_USE_DOUBAO_VISION: ()=>MIDSCENE_USE_DOUBAO_VISION,
109
109
  MIDSCENE_USE_GEMINI: ()=>MIDSCENE_USE_GEMINI,
110
+ MIDSCENE_USE_QWEN3_VL: ()=>MIDSCENE_USE_QWEN3_VL,
110
111
  MIDSCENE_USE_QWEN_VL: ()=>MIDSCENE_USE_QWEN_VL,
111
112
  MIDSCENE_USE_VLM_UI_TARS: ()=>MIDSCENE_USE_VLM_UI_TARS,
112
113
  MIDSCENE_USE_VL_MODEL: ()=>MIDSCENE_USE_VL_MODEL,
@@ -163,6 +164,7 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
163
164
  const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
164
165
  const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
165
166
  const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
167
+ const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
166
168
  const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
167
169
  const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
168
170
  const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
@@ -294,6 +296,7 @@ const MODEL_ENV_KEYS = [
294
296
  MIDSCENE_USE_ANTHROPIC_SDK,
295
297
  MIDSCENE_USE_VLM_UI_TARS,
296
298
  MIDSCENE_USE_QWEN_VL,
299
+ MIDSCENE_USE_QWEN3_VL,
297
300
  MIDSCENE_USE_DOUBAO_VISION,
298
301
  MIDSCENE_USE_GEMINI,
299
302
  MIDSCENE_USE_VL_MODEL,
@@ -380,6 +383,7 @@ const VL_MODE_RAW_VALID_VALUES = [
380
383
  'doubao-vision',
381
384
  'gemini',
382
385
  'qwen-vl',
386
+ 'qwen3-vl',
383
387
  'vlm-ui-tars',
384
388
  'vlm-ui-tars-doubao',
385
389
  'vlm-ui-tars-doubao-1.5'
@@ -467,6 +471,7 @@ exports.MIDSCENE_USE_ANTHROPIC_SDK = __webpack_exports__.MIDSCENE_USE_ANTHROPIC_
467
471
  exports.MIDSCENE_USE_AZURE_OPENAI = __webpack_exports__.MIDSCENE_USE_AZURE_OPENAI;
468
472
  exports.MIDSCENE_USE_DOUBAO_VISION = __webpack_exports__.MIDSCENE_USE_DOUBAO_VISION;
469
473
  exports.MIDSCENE_USE_GEMINI = __webpack_exports__.MIDSCENE_USE_GEMINI;
474
+ exports.MIDSCENE_USE_QWEN3_VL = __webpack_exports__.MIDSCENE_USE_QWEN3_VL;
470
475
  exports.MIDSCENE_USE_QWEN_VL = __webpack_exports__.MIDSCENE_USE_QWEN_VL;
471
476
  exports.MIDSCENE_USE_VLM_UI_TARS = __webpack_exports__.MIDSCENE_USE_VLM_UI_TARS;
472
477
  exports.MIDSCENE_USE_VL_MODEL = __webpack_exports__.MIDSCENE_USE_VL_MODEL;
@@ -582,6 +587,7 @@ for(var __webpack_i__ in __webpack_exports__)if (-1 === [
582
587
  "MIDSCENE_USE_AZURE_OPENAI",
583
588
  "MIDSCENE_USE_DOUBAO_VISION",
584
589
  "MIDSCENE_USE_GEMINI",
590
+ "MIDSCENE_USE_QWEN3_VL",
585
591
  "MIDSCENE_USE_QWEN_VL",
586
592
  "MIDSCENE_USE_VLM_UI_TARS",
587
593
  "MIDSCENE_USE_VL_MODEL",
@@ -161,26 +161,46 @@ async function paddingToMatchBlock(image, blockSize = 28) {
161
161
  const { width, height } = image.bitmap;
162
162
  const targetWidth = Math.ceil(width / blockSize) * blockSize;
163
163
  const targetHeight = Math.ceil(height / blockSize) * blockSize;
164
- if (targetWidth === width && targetHeight === height) return image;
164
+ if (targetWidth === width && targetHeight === height) return {
165
+ width,
166
+ height,
167
+ image
168
+ };
165
169
  const Jimp = await external_get_jimp_js_default()();
166
170
  const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
167
171
  paddedImage.composite(image, 0, 0);
168
- return paddedImage;
172
+ return {
173
+ width: targetWidth,
174
+ height: targetHeight,
175
+ image: paddedImage
176
+ };
169
177
  }
170
178
  async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
171
179
  const jimpImage = await jimpFromBase64(imageBase64);
172
- const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
173
- return jimpToBase64(paddedImage);
180
+ const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
181
+ return {
182
+ width: paddedResult.width,
183
+ height: paddedResult.height,
184
+ imageBase64: await jimpToBase64(paddedResult.image)
185
+ };
174
186
  }
175
187
  async function cropByRect(imageBase64, rect, paddingImage) {
176
188
  const jimpImage = await jimpFromBase64(imageBase64);
177
189
  const { left, top, width, height } = rect;
178
190
  jimpImage.crop(left, top, width, height);
179
191
  if (paddingImage) {
180
- const paddedImage = await paddingToMatchBlock(jimpImage);
181
- return jimpToBase64(paddedImage);
192
+ const paddedResult = await paddingToMatchBlock(jimpImage);
193
+ return {
194
+ width: paddedResult.width,
195
+ height: paddedResult.height,
196
+ imageBase64: await jimpToBase64(paddedResult.image)
197
+ };
182
198
  }
183
- return jimpToBase64(jimpImage);
199
+ return {
200
+ width: jimpImage.bitmap.width,
201
+ height: jimpImage.bitmap.height,
202
+ imageBase64: await jimpToBase64(jimpImage)
203
+ };
184
204
  }
185
205
  async function jimpToBase64(image) {
186
206
  const Jimp = await external_get_jimp_js_default()();
@@ -22,6 +22,7 @@ export declare const MIDSCENE_ANDROID_IME_STRATEGY = "MIDSCENE_ANDROID_IME_STRAT
22
22
  export declare const MIDSCENE_CACHE = "MIDSCENE_CACHE";
23
23
  export declare const MIDSCENE_USE_VLM_UI_TARS = "MIDSCENE_USE_VLM_UI_TARS";
24
24
  export declare const MIDSCENE_USE_QWEN_VL = "MIDSCENE_USE_QWEN_VL";
25
+ export declare const MIDSCENE_USE_QWEN3_VL = "MIDSCENE_USE_QWEN3_VL";
25
26
  export declare const MIDSCENE_USE_DOUBAO_VISION = "MIDSCENE_USE_DOUBAO_VISION";
26
27
  export declare const MIDSCENE_USE_GEMINI = "MIDSCENE_USE_GEMINI";
27
28
  export declare const MIDSCENE_USE_VL_MODEL = "MIDSCENE_USE_VL_MODEL";
@@ -125,12 +126,12 @@ export declare const GLOBAL_ENV_KEYS: readonly ["MIDSCENE_CACHE", "MIDSCENE_LANG
125
126
  * Can be override by both agent.modelConfig and overrideAIConfig
126
127
  * Can only be access after agent.constructor
127
128
  */
128
- export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
129
- export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
129
+ export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_QWEN3_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
130
+ export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_QWEN3_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
130
131
  export type TEnvKeys = (typeof ALL_ENV_KEYS)[number];
131
132
  export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
132
- export type TVlModeValues = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
133
- export type TVlModeTypes = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
133
+ export type TVlModeValues = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
134
+ export type TVlModeTypes = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
134
135
  export interface IModelConfigForVQA {
135
136
  [MIDSCENE_VQA_MODEL_NAME]: string;
136
137
  [MIDSCENE_VQA_OPENAI_SOCKS_PROXY]?: string;
@@ -49,9 +49,21 @@ export declare function zoomForGPT4o(originalWidth: number, originalHeight: numb
49
49
  height: number;
50
50
  };
51
51
  export declare function jimpFromBase64(base64: string): Promise<Jimp>;
52
- export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<Jimp>;
53
- export declare function paddingToMatchBlockByBase64(imageBase64: string, blockSize?: number): Promise<string>;
54
- export declare function cropByRect(imageBase64: string, rect: Rect, paddingImage: boolean): Promise<string>;
52
+ export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<{
53
+ width: number;
54
+ height: number;
55
+ image: Jimp;
56
+ }>;
57
+ export declare function paddingToMatchBlockByBase64(imageBase64: string, blockSize?: number): Promise<{
58
+ width: number;
59
+ height: number;
60
+ imageBase64: string;
61
+ }>;
62
+ export declare function cropByRect(imageBase64: string, rect: Rect, paddingImage: boolean): Promise<{
63
+ width: number;
64
+ height: number;
65
+ imageBase64: string;
66
+ }>;
55
67
  export declare function jimpToBase64(image: Jimp): Promise<string>;
56
68
  export declare const httpImg2Base64: (url: string) => Promise<string>;
57
69
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@midscene/shared",
3
- "version": "0.29.0",
3
+ "version": "0.29.1",
4
4
  "repository": "https://github.com/web-infra-dev/midscene",
5
5
  "homepage": "https://midscenejs.com/",
6
6
  "types": "./dist/types/index.d.ts",
package/src/env/parse.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import {
2
2
  MIDSCENE_USE_DOUBAO_VISION,
3
3
  MIDSCENE_USE_GEMINI,
4
+ MIDSCENE_USE_QWEN3_VL,
4
5
  MIDSCENE_USE_QWEN_VL,
5
6
  MIDSCENE_USE_VLM_UI_TARS,
6
7
  type TVlModeTypes,
@@ -58,12 +59,14 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
58
59
  } => {
59
60
  const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
60
61
  const isQwen = provider[MIDSCENE_USE_QWEN_VL];
62
+ const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
61
63
  const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
62
64
  const isGemini = provider[MIDSCENE_USE_GEMINI];
63
65
 
64
66
  const enabledModes = [
65
67
  isDoubao && MIDSCENE_USE_DOUBAO_VISION,
66
68
  isQwen && MIDSCENE_USE_QWEN_VL,
69
+ isQwen3 && MIDSCENE_USE_QWEN3_VL,
67
70
  isUiTars && MIDSCENE_USE_VLM_UI_TARS,
68
71
  isGemini && MIDSCENE_USE_GEMINI,
69
72
  ].filter(Boolean);
@@ -74,6 +77,13 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
74
77
  );
75
78
  }
76
79
 
80
+ if (isQwen3) {
81
+ return {
82
+ vlMode: 'qwen3-vl',
83
+ uiTarsVersion: undefined,
84
+ };
85
+ }
86
+
77
87
  if (isQwen) {
78
88
  return {
79
89
  vlMode: 'qwen-vl',
package/src/env/types.ts CHANGED
@@ -29,6 +29,7 @@ export const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
29
29
  export const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
30
30
  export const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
31
31
  export const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
32
+ export const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
32
33
  export const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
33
34
  export const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
34
35
  export const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
@@ -242,6 +243,7 @@ export const MODEL_ENV_KEYS = [
242
243
  MIDSCENE_USE_ANTHROPIC_SDK,
243
244
  MIDSCENE_USE_VLM_UI_TARS,
244
245
  MIDSCENE_USE_QWEN_VL,
246
+ MIDSCENE_USE_QWEN3_VL,
245
247
  MIDSCENE_USE_DOUBAO_VISION,
246
248
  MIDSCENE_USE_GEMINI,
247
249
  MIDSCENE_USE_VL_MODEL,
@@ -328,6 +330,7 @@ export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
328
330
 
329
331
  export type TVlModeValues =
330
332
  | 'qwen-vl'
333
+ | 'qwen3-vl'
331
334
  | 'doubao-vision'
332
335
  | 'gemini'
333
336
  | 'vlm-ui-tars'
@@ -336,6 +339,7 @@ export type TVlModeValues =
336
339
 
337
340
  export type TVlModeTypes =
338
341
  | 'qwen-vl'
342
+ | 'qwen3-vl'
339
343
  | 'doubao-vision'
340
344
  | 'gemini'
341
345
  | 'vlm-ui-tars';
@@ -497,6 +501,7 @@ export const VL_MODE_RAW_VALID_VALUES: TVlModeValues[] = [
497
501
  'doubao-vision',
498
502
  'gemini',
499
503
  'qwen-vl',
504
+ 'qwen3-vl',
500
505
  'vlm-ui-tars',
501
506
  'vlm-ui-tars-doubao',
502
507
  'vlm-ui-tars-doubao-1.5',
@@ -228,14 +228,18 @@ export async function jimpFromBase64(base64: string): Promise<Jimp> {
228
228
  export async function paddingToMatchBlock(
229
229
  image: Jimp,
230
230
  blockSize = 28,
231
- ): Promise<Jimp> {
231
+ ): Promise<{
232
+ width: number;
233
+ height: number;
234
+ image: Jimp;
235
+ }> {
232
236
  const { width, height } = image.bitmap;
233
237
 
234
238
  const targetWidth = Math.ceil(width / blockSize) * blockSize;
235
239
  const targetHeight = Math.ceil(height / blockSize) * blockSize;
236
240
 
237
241
  if (targetWidth === width && targetHeight === height) {
238
- return image;
242
+ return { width, height, image };
239
243
  }
240
244
 
241
245
  const Jimp = await getJimp();
@@ -243,31 +247,52 @@ export async function paddingToMatchBlock(
243
247
 
244
248
  // Composite the original image onto the new canvas
245
249
  paddedImage.composite(image, 0, 0);
246
- return paddedImage;
250
+ return { width: targetWidth, height: targetHeight, image: paddedImage };
247
251
  }
248
252
 
249
253
  export async function paddingToMatchBlockByBase64(
250
254
  imageBase64: string,
251
255
  blockSize = 28,
252
- ): Promise<string> {
256
+ ): Promise<{
257
+ width: number;
258
+ height: number;
259
+ imageBase64: string;
260
+ }> {
253
261
  const jimpImage = await jimpFromBase64(imageBase64);
254
- const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
255
- return jimpToBase64(paddedImage);
262
+ const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
263
+ return {
264
+ width: paddedResult.width,
265
+ height: paddedResult.height,
266
+ imageBase64: await jimpToBase64(paddedResult.image),
267
+ };
256
268
  }
269
+
257
270
  export async function cropByRect(
258
271
  imageBase64: string,
259
272
  rect: Rect,
260
273
  paddingImage: boolean,
261
- ): Promise<string> {
274
+ ): Promise<{
275
+ width: number;
276
+ height: number;
277
+ imageBase64: string;
278
+ }> {
262
279
  const jimpImage = await jimpFromBase64(imageBase64);
263
280
  const { left, top, width, height } = rect;
264
281
  jimpImage.crop(left, top, width, height);
265
282
 
266
283
  if (paddingImage) {
267
- const paddedImage = await paddingToMatchBlock(jimpImage);
268
- return jimpToBase64(paddedImage);
284
+ const paddedResult = await paddingToMatchBlock(jimpImage);
285
+ return {
286
+ width: paddedResult.width,
287
+ height: paddedResult.height,
288
+ imageBase64: await jimpToBase64(paddedResult.image),
289
+ };
269
290
  }
270
- return jimpToBase64(jimpImage);
291
+ return {
292
+ width: jimpImage.bitmap.width,
293
+ height: jimpImage.bitmap.height,
294
+ imageBase64: await jimpToBase64(jimpImage),
295
+ };
271
296
  }
272
297
 
273
298
  export async function jimpToBase64(image: Jimp): Promise<string> {