@midscene/shared 0.28.12-beta-20250924093113.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/env/parse.mjs +1 -7
- package/dist/es/env/types.mjs +1 -4
- package/dist/es/img/transform.mjs +7 -27
- package/dist/lib/env/parse.js +0 -6
- package/dist/lib/env/types.js +0 -6
- package/dist/lib/img/transform.js +7 -27
- package/dist/types/env/types.d.ts +4 -5
- package/dist/types/img/transform.d.ts +3 -15
- package/package.json +1 -1
- package/src/env/parse.ts +0 -10
- package/src/env/types.ts +0 -5
- package/src/img/transform.ts +10 -35
package/dist/es/env/parse.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
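import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN3_VL, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";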
|
|
1
|
+
import { MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, UITarsModelVersion, VL_MODE_RAW_VALID_VALUES } from "./types.mjs";
|
|
2
2
|
const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
3
3
|
if (!vlModeRaw) return {
|
|
4
4
|
vlMode: void 0,
|
|
@@ -22,21 +22,15 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
|
22
22
|
const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
|
|
23
23
|
const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
|
|
24
24
|
const isQwen = provider[MIDSCENE_USE_QWEN_VL];
|
|
25
|
-
const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
|
|
26
25
|
const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
|
|
27
26
|
const isGemini = provider[MIDSCENE_USE_GEMINI];
|
|
28
27
|
const enabledModes = [
|
|
29
28
|
isDoubao && MIDSCENE_USE_DOUBAO_VISION,
|
|
30
29
|
isQwen && MIDSCENE_USE_QWEN_VL,
|
|
31
|
-
isQwen3 && MIDSCENE_USE_QWEN3_VL,
|
|
32
30
|
isUiTars && MIDSCENE_USE_VLM_UI_TARS,
|
|
33
31
|
isGemini && MIDSCENE_USE_GEMINI
|
|
34
32
|
].filter(Boolean);
|
|
35
33
|
if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
|
|
36
|
-
if (isQwen3) return {
|
|
37
|
-
vlMode: 'qwen3-vl',
|
|
38
|
-
uiTarsVersion: void 0
|
|
39
|
-
};
|
|
40
34
|
if (isQwen) return {
|
|
41
35
|
vlMode: 'qwen-vl',
|
|
42
36
|
uiTarsVersion: void 0
|
package/dist/es/env/types.mjs
CHANGED
|
@@ -22,7 +22,6 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
22
22
|
const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
23
23
|
const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
24
24
|
const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
25
|
-
const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
26
25
|
const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
27
26
|
const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
28
27
|
const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -154,7 +153,6 @@ const MODEL_ENV_KEYS = [
|
|
|
154
153
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
155
154
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
156
155
|
MIDSCENE_USE_QWEN_VL,
|
|
157
|
-
MIDSCENE_USE_QWEN3_VL,
|
|
158
156
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
159
157
|
MIDSCENE_USE_GEMINI,
|
|
160
158
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -241,9 +239,8 @@ const VL_MODE_RAW_VALID_VALUES = [
|
|
|
241
239
|
'doubao-vision',
|
|
242
240
|
'gemini',
|
|
243
241
|
'qwen-vl',
|
|
244
|
-
'qwen3-vl',
|
|
245
242
|
'vlm-ui-tars',
|
|
246
243
|
'vlm-ui-tars-doubao',
|
|
247
244
|
'vlm-ui-tars-doubao-1.5'
|
|
248
245
|
];
|
|
249
|
-
export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, 
MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI,
|
|
246
|
+
export { ALL_ENV_KEYS, ANTHROPIC_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, BASIC_ENV_KEYS, BOOLEAN_ENV_KEYS, DOCKER_CONTAINER, GLOBAL_ENV_KEYS, MATCH_BY_POSITION, MIDSCENE_ADB_PATH, MIDSCENE_ADB_REMOTE_HOST, MIDSCENE_ADB_REMOTE_PORT, MIDSCENE_ANDROID_IME_STRATEGY, MIDSCENE_ANTHROPIC_API_KEY, MIDSCENE_API_TYPE, MIDSCENE_AZURE_OPENAI_API_VERSION, MIDSCENE_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_AZURE_OPENAI_ENDPOINT, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_KEY, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_CACHE, MIDSCENE_CACHE_MAX_FILENAME_LENGTH, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, MIDSCENE_DEBUG_AI_PROFILE, MIDSCENE_DEBUG_AI_RESPONSE, MIDSCENE_DEBUG_MODE, MIDSCENE_FORCE_DEEP_THINK, MIDSCENE_GROUNDING_ANTHROPIC_API_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION, MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT, MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_AZURE_OPENAI_KEY, MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE, MIDSCENE_GROUNDING_MODEL_NAME, MIDSCENE_GROUNDING_OPENAI_API_KEY, MIDSCENE_GROUNDING_OPENAI_BASE_URL, MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY, MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY, MIDSCENE_GROUNDING_OPENAI_USE_AZURE, MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK, MIDSCENE_GROUNDING_USE_AZURE_OPENAI, MIDSCENE_GROUNDING_VL_MODE, MIDSCENE_LANGSMITH_DEBUG, MIDSCENE_MCP_ANDROID_MODE, MIDSCENE_MCP_CHROME_PATH, MIDSCENE_MCP_USE_PUPPETEER_MODE, MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_API_KEY, MIDSCENE_OPENAI_BASE_URL, MIDSCENE_OPENAI_HTTP_PROXY, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, MIDSCENE_OPENAI_USE_AZURE, MIDSCENE_PLANNING_ANTHROPIC_API_KEY, MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION, MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT, MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_AZURE_OPENAI_KEY, 
MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE, MIDSCENE_PLANNING_MODEL_NAME, MIDSCENE_PLANNING_OPENAI_API_KEY, MIDSCENE_PLANNING_OPENAI_BASE_URL, MIDSCENE_PLANNING_OPENAI_HTTP_PROXY, MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON, MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY, MIDSCENE_PLANNING_OPENAI_USE_AZURE, MIDSCENE_PLANNING_USE_ANTHROPIC_SDK, MIDSCENE_PLANNING_USE_AZURE_OPENAI, MIDSCENE_PLANNING_VL_MODE, MIDSCENE_PREFERRED_LANGUAGE, MIDSCENE_REPLANNING_CYCLE_LIMIT, MIDSCENE_REPORT_TAG_NAME, MIDSCENE_RUN_DIR, MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, MIDSCENE_USE_DOUBAO_VISION, MIDSCENE_USE_GEMINI, MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, MIDSCENE_USE_VL_MODEL, MIDSCENE_VL_MODE, MIDSCENE_VQA_ANTHROPIC_API_KEY, MIDSCENE_VQA_AZURE_OPENAI_API_VERSION, MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT, MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT, MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_AZURE_OPENAI_KEY, MIDSCENE_VQA_AZURE_OPENAI_SCOPE, MIDSCENE_VQA_MODEL_NAME, MIDSCENE_VQA_OPENAI_API_KEY, MIDSCENE_VQA_OPENAI_BASE_URL, MIDSCENE_VQA_OPENAI_HTTP_PROXY, MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON, MIDSCENE_VQA_OPENAI_SOCKS_PROXY, MIDSCENE_VQA_OPENAI_USE_AZURE, MIDSCENE_VQA_USE_ANTHROPIC_SDK, MIDSCENE_VQA_USE_AZURE_OPENAI, MIDSCENE_VQA_VL_MODE, MODEL_ENV_KEYS, NUMBER_ENV_KEYS, OPENAI_API_KEY, OPENAI_BASE_URL, OPENAI_MAX_TOKENS, OPENAI_USE_AZURE, STRING_ENV_KEYS, types_UITarsModelVersion as UITarsModelVersion, UNUSED_ENV_KEYS, VL_MODE_RAW_VALID_VALUES };
|
package/dist/es/img/transform.mjs
CHANGED
|
@@ -106,46 +106,26 @@ async function paddingToMatchBlock(image, blockSize = 28) {
|
|
|
106
106
|
const { width, height } = image.bitmap;
|
|
107
107
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
108
108
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
109
|
-
if (targetWidth === width && targetHeight === height) return {
|
|
110
|
-
width,
|
|
111
|
-
height,
|
|
112
|
-
image
|
|
113
|
-
};
|
|
109
|
+
if (targetWidth === width && targetHeight === height) return image;
|
|
114
110
|
const Jimp = await get_jimp();
|
|
115
111
|
const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
|
|
116
112
|
paddedImage.composite(image, 0, 0);
|
|
117
|
-
return {
|
|
118
|
-
width: targetWidth,
|
|
119
|
-
height: targetHeight,
|
|
120
|
-
image: paddedImage
|
|
121
|
-
};
|
|
113
|
+
return paddedImage;
|
|
122
114
|
}
|
|
123
115
|
async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
|
|
124
116
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
125
|
-
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
126
|
-
return {
|
|
127
|
-
width: paddedResult.width,
|
|
128
|
-
height: paddedResult.height,
|
|
129
|
-
imageBase64: await jimpToBase64(paddedResult.image)
|
|
130
|
-
};
|
|
117
|
+
const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
|
|
118
|
+
return jimpToBase64(paddedImage);
|
|
131
119
|
}
|
|
132
120
|
async function cropByRect(imageBase64, rect, paddingImage) {
|
|
133
121
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
134
122
|
const { left, top, width, height } = rect;
|
|
135
123
|
jimpImage.crop(left, top, width, height);
|
|
136
124
|
if (paddingImage) {
|
|
137
|
-
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
138
|
-
return {
|
|
139
|
-
width: paddedResult.width,
|
|
140
|
-
height: paddedResult.height,
|
|
141
|
-
imageBase64: await jimpToBase64(paddedResult.image)
|
|
142
|
-
};
|
|
125
|
+
const paddedImage = await paddingToMatchBlock(jimpImage);
|
|
126
|
+
return jimpToBase64(paddedImage);
|
|
143
127
|
}
|
|
144
|
-
return {
|
|
145
|
-
width: jimpImage.bitmap.width,
|
|
146
|
-
height: jimpImage.bitmap.height,
|
|
147
|
-
imageBase64: await jimpToBase64(jimpImage)
|
|
148
|
-
};
|
|
128
|
+
return jimpToBase64(jimpImage);
|
|
149
129
|
}
|
|
150
130
|
async function jimpToBase64(image) {
|
|
151
131
|
const Jimp = await get_jimp();
|
package/dist/lib/env/parse.js
CHANGED
|
@@ -51,21 +51,15 @@ const parseVlModeAndUiTarsModelVersionFromRawValue = (vlModeRaw)=>{
|
|
|
51
51
|
const parseVlModeAndUiTarsFromGlobalConfig = (provider)=>{
|
|
52
52
|
const isDoubao = provider[external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION];
|
|
53
53
|
const isQwen = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL];
|
|
54
|
-
const isQwen3 = provider[external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL];
|
|
55
54
|
const isUiTars = provider[external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS];
|
|
56
55
|
const isGemini = provider[external_types_js_namespaceObject.MIDSCENE_USE_GEMINI];
|
|
57
56
|
const enabledModes = [
|
|
58
57
|
isDoubao && external_types_js_namespaceObject.MIDSCENE_USE_DOUBAO_VISION,
|
|
59
58
|
isQwen && external_types_js_namespaceObject.MIDSCENE_USE_QWEN_VL,
|
|
60
|
-
isQwen3 && external_types_js_namespaceObject.MIDSCENE_USE_QWEN3_VL,
|
|
61
59
|
isUiTars && external_types_js_namespaceObject.MIDSCENE_USE_VLM_UI_TARS,
|
|
62
60
|
isGemini && external_types_js_namespaceObject.MIDSCENE_USE_GEMINI
|
|
63
61
|
].filter(Boolean);
|
|
64
62
|
if (enabledModes.length > 1) throw new Error(`Only one vision mode can be enabled at a time. Currently enabled modes: ${enabledModes.join(', ')}. Please disable all but one mode.`);
|
|
65
|
-
if (isQwen3) return {
|
|
66
|
-
vlMode: 'qwen3-vl',
|
|
67
|
-
uiTarsVersion: void 0
|
|
68
|
-
};
|
|
69
63
|
if (isQwen) return {
|
|
70
64
|
vlMode: 'qwen-vl',
|
|
71
65
|
uiTarsVersion: void 0
|
package/dist/lib/env/types.js
CHANGED
|
@@ -107,7 +107,6 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
107
107
|
MIDSCENE_USE_AZURE_OPENAI: ()=>MIDSCENE_USE_AZURE_OPENAI,
|
|
108
108
|
MIDSCENE_USE_DOUBAO_VISION: ()=>MIDSCENE_USE_DOUBAO_VISION,
|
|
109
109
|
MIDSCENE_USE_GEMINI: ()=>MIDSCENE_USE_GEMINI,
|
|
110
|
-
MIDSCENE_USE_QWEN3_VL: ()=>MIDSCENE_USE_QWEN3_VL,
|
|
111
110
|
MIDSCENE_USE_QWEN_VL: ()=>MIDSCENE_USE_QWEN_VL,
|
|
112
111
|
MIDSCENE_USE_VLM_UI_TARS: ()=>MIDSCENE_USE_VLM_UI_TARS,
|
|
113
112
|
MIDSCENE_USE_VL_MODEL: ()=>MIDSCENE_USE_VL_MODEL,
|
|
@@ -164,7 +163,6 @@ const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
164
163
|
const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
165
164
|
const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
166
165
|
const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
167
|
-
const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
168
166
|
const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
169
167
|
const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
170
168
|
const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -296,7 +294,6 @@ const MODEL_ENV_KEYS = [
|
|
|
296
294
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
297
295
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
298
296
|
MIDSCENE_USE_QWEN_VL,
|
|
299
|
-
MIDSCENE_USE_QWEN3_VL,
|
|
300
297
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
301
298
|
MIDSCENE_USE_GEMINI,
|
|
302
299
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -383,7 +380,6 @@ const VL_MODE_RAW_VALID_VALUES = [
|
|
|
383
380
|
'doubao-vision',
|
|
384
381
|
'gemini',
|
|
385
382
|
'qwen-vl',
|
|
386
|
-
'qwen3-vl',
|
|
387
383
|
'vlm-ui-tars',
|
|
388
384
|
'vlm-ui-tars-doubao',
|
|
389
385
|
'vlm-ui-tars-doubao-1.5'
|
|
@@ -471,7 +467,6 @@ exports.MIDSCENE_USE_ANTHROPIC_SDK = __webpack_exports__.MIDSCENE_USE_ANTHROPIC_
|
|
|
471
467
|
exports.MIDSCENE_USE_AZURE_OPENAI = __webpack_exports__.MIDSCENE_USE_AZURE_OPENAI;
|
|
472
468
|
exports.MIDSCENE_USE_DOUBAO_VISION = __webpack_exports__.MIDSCENE_USE_DOUBAO_VISION;
|
|
473
469
|
exports.MIDSCENE_USE_GEMINI = __webpack_exports__.MIDSCENE_USE_GEMINI;
|
|
474
|
-
exports.MIDSCENE_USE_QWEN3_VL = __webpack_exports__.MIDSCENE_USE_QWEN3_VL;
|
|
475
470
|
exports.MIDSCENE_USE_QWEN_VL = __webpack_exports__.MIDSCENE_USE_QWEN_VL;
|
|
476
471
|
exports.MIDSCENE_USE_VLM_UI_TARS = __webpack_exports__.MIDSCENE_USE_VLM_UI_TARS;
|
|
477
472
|
exports.MIDSCENE_USE_VL_MODEL = __webpack_exports__.MIDSCENE_USE_VL_MODEL;
|
|
@@ -587,7 +582,6 @@ for(var __webpack_i__ in __webpack_exports__)if (-1 === [
|
|
|
587
582
|
"MIDSCENE_USE_AZURE_OPENAI",
|
|
588
583
|
"MIDSCENE_USE_DOUBAO_VISION",
|
|
589
584
|
"MIDSCENE_USE_GEMINI",
|
|
590
|
-
"MIDSCENE_USE_QWEN3_VL",
|
|
591
585
|
"MIDSCENE_USE_QWEN_VL",
|
|
592
586
|
"MIDSCENE_USE_VLM_UI_TARS",
|
|
593
587
|
"MIDSCENE_USE_VL_MODEL",
|
package/dist/lib/img/transform.js
CHANGED
|
@@ -161,46 +161,26 @@ async function paddingToMatchBlock(image, blockSize = 28) {
|
|
|
161
161
|
const { width, height } = image.bitmap;
|
|
162
162
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
163
163
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
164
|
-
if (targetWidth === width && targetHeight === height) return {
|
|
165
|
-
width,
|
|
166
|
-
height,
|
|
167
|
-
image
|
|
168
|
-
};
|
|
164
|
+
if (targetWidth === width && targetHeight === height) return image;
|
|
169
165
|
const Jimp = await external_get_jimp_js_default()();
|
|
170
166
|
const paddedImage = new Jimp(targetWidth, targetHeight, 0xffffffff);
|
|
171
167
|
paddedImage.composite(image, 0, 0);
|
|
172
|
-
return {
|
|
173
|
-
width: targetWidth,
|
|
174
|
-
height: targetHeight,
|
|
175
|
-
image: paddedImage
|
|
176
|
-
};
|
|
168
|
+
return paddedImage;
|
|
177
169
|
}
|
|
178
170
|
async function paddingToMatchBlockByBase64(imageBase64, blockSize = 28) {
|
|
179
171
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
180
|
-
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
181
|
-
return {
|
|
182
|
-
width: paddedResult.width,
|
|
183
|
-
height: paddedResult.height,
|
|
184
|
-
imageBase64: await jimpToBase64(paddedResult.image)
|
|
185
|
-
};
|
|
172
|
+
const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
|
|
173
|
+
return jimpToBase64(paddedImage);
|
|
186
174
|
}
|
|
187
175
|
async function cropByRect(imageBase64, rect, paddingImage) {
|
|
188
176
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
189
177
|
const { left, top, width, height } = rect;
|
|
190
178
|
jimpImage.crop(left, top, width, height);
|
|
191
179
|
if (paddingImage) {
|
|
192
|
-
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
193
|
-
return {
|
|
194
|
-
width: paddedResult.width,
|
|
195
|
-
height: paddedResult.height,
|
|
196
|
-
imageBase64: await jimpToBase64(paddedResult.image)
|
|
197
|
-
};
|
|
180
|
+
const paddedImage = await paddingToMatchBlock(jimpImage);
|
|
181
|
+
return jimpToBase64(paddedImage);
|
|
198
182
|
}
|
|
199
|
-
return {
|
|
200
|
-
width: jimpImage.bitmap.width,
|
|
201
|
-
height: jimpImage.bitmap.height,
|
|
202
|
-
imageBase64: await jimpToBase64(jimpImage)
|
|
203
|
-
};
|
|
183
|
+
return jimpToBase64(jimpImage);
|
|
204
184
|
}
|
|
205
185
|
async function jimpToBase64(image) {
|
|
206
186
|
const Jimp = await external_get_jimp_js_default()();
|
package/dist/types/env/types.d.ts
CHANGED
|
@@ -22,7 +22,6 @@ export declare const MIDSCENE_ANDROID_IME_STRATEGY = "MIDSCENE_ANDROID_IME_STRAT
|
|
|
22
22
|
export declare const MIDSCENE_CACHE = "MIDSCENE_CACHE";
|
|
23
23
|
export declare const MIDSCENE_USE_VLM_UI_TARS = "MIDSCENE_USE_VLM_UI_TARS";
|
|
24
24
|
export declare const MIDSCENE_USE_QWEN_VL = "MIDSCENE_USE_QWEN_VL";
|
|
25
|
-
export declare const MIDSCENE_USE_QWEN3_VL = "MIDSCENE_USE_QWEN3_VL";
|
|
26
25
|
export declare const MIDSCENE_USE_DOUBAO_VISION = "MIDSCENE_USE_DOUBAO_VISION";
|
|
27
26
|
export declare const MIDSCENE_USE_GEMINI = "MIDSCENE_USE_GEMINI";
|
|
28
27
|
export declare const MIDSCENE_USE_VL_MODEL = "MIDSCENE_USE_VL_MODEL";
|
|
@@ -126,12 +125,12 @@ export declare const GLOBAL_ENV_KEYS: readonly ["MIDSCENE_CACHE", "MIDSCENE_LANG
|
|
|
126
125
|
* Can be override by both agent.modelConfig and overrideAIConfig
|
|
127
126
|
* Can only be access after agent.constructor
|
|
128
127
|
*/
|
|
129
|
-
export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "
|
|
130
|
-
export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "
|
|
128
|
+
export declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", "MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", 
"MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
129
|
+
export declare const ALL_ENV_KEYS: readonly [...string[], "MIDSCENE_DEBUG_MODE", "MIDSCENE_DEBUG_AI_PROFILE", "MIDSCENE_DEBUG_AI_RESPONSE", "MIDSCENE_RUN_DIR", "MIDSCENE_CACHE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_FORCE_DEEP_THINK", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER", "MIDSCENE_MODEL_NAME", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_API_KEY", "MIDSCENE_OPENAI_BASE_URL", "MIDSCENE_OPENAI_USE_AZURE", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_USE_AZURE_OPENAI", "MIDSCENE_AZURE_OPENAI_SCOPE", "MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_USE_ANTHROPIC_SDK", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "ANTHROPIC_API_KEY", "MIDSCENE_AZURE_OPENAI_ENDPOINT", "MIDSCENE_AZURE_OPENAI_KEY", "MIDSCENE_AZURE_OPENAI_API_VERSION", "MIDSCENE_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VL_MODE", "OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_USE_AZURE", "ANTHROPIC_API_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY", "AZURE_OPENAI_API_VERSION", "AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_MODEL_NAME", "MIDSCENE_VQA_OPENAI_SOCKS_PROXY", "MIDSCENE_VQA_OPENAI_HTTP_PROXY", "MIDSCENE_VQA_OPENAI_BASE_URL", "MIDSCENE_VQA_OPENAI_API_KEY", "MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_OPENAI_USE_AZURE", "MIDSCENE_VQA_USE_AZURE_OPENAI", "MIDSCENE_VQA_AZURE_OPENAI_SCOPE", "MIDSCENE_VQA_AZURE_OPENAI_KEY", "MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT", "MIDSCENE_VQA_AZURE_OPENAI_API_VERSION", "MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_VQA_USE_ANTHROPIC_SDK", 
"MIDSCENE_VQA_ANTHROPIC_API_KEY", "MIDSCENE_VQA_VL_MODE", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_OPENAI_SOCKS_PROXY", "MIDSCENE_PLANNING_OPENAI_HTTP_PROXY", "MIDSCENE_PLANNING_OPENAI_BASE_URL", "MIDSCENE_PLANNING_OPENAI_API_KEY", "MIDSCENE_PLANNING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_OPENAI_USE_AZURE", "MIDSCENE_PLANNING_USE_AZURE_OPENAI", "MIDSCENE_PLANNING_AZURE_OPENAI_SCOPE", "MIDSCENE_PLANNING_AZURE_OPENAI_KEY", "MIDSCENE_PLANNING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_PLANNING_AZURE_OPENAI_API_VERSION", "MIDSCENE_PLANNING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_PLANNING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_USE_ANTHROPIC_SDK", "MIDSCENE_PLANNING_ANTHROPIC_API_KEY", "MIDSCENE_PLANNING_VL_MODE", "MIDSCENE_GROUNDING_MODEL_NAME", "MIDSCENE_GROUNDING_OPENAI_SOCKS_PROXY", "MIDSCENE_GROUNDING_OPENAI_HTTP_PROXY", "MIDSCENE_GROUNDING_OPENAI_BASE_URL", "MIDSCENE_GROUNDING_OPENAI_API_KEY", "MIDSCENE_GROUNDING_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_OPENAI_USE_AZURE", "MIDSCENE_GROUNDING_USE_AZURE_OPENAI", "MIDSCENE_GROUNDING_AZURE_OPENAI_SCOPE", "MIDSCENE_GROUNDING_AZURE_OPENAI_KEY", "MIDSCENE_GROUNDING_AZURE_OPENAI_ENDPOINT", "MIDSCENE_GROUNDING_AZURE_OPENAI_API_VERSION", "MIDSCENE_GROUNDING_AZURE_OPENAI_DEPLOYMENT", "MIDSCENE_GROUNDING_AZURE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_GROUNDING_USE_ANTHROPIC_SDK", "MIDSCENE_GROUNDING_ANTHROPIC_API_KEY", "MIDSCENE_GROUNDING_VL_MODE"];
|
|
131
130
|
export type TEnvKeys = (typeof ALL_ENV_KEYS)[number];
|
|
132
131
|
export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
|
|
133
|
-
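export type TVlModeValues = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';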
|
|
134
|
-
export type TVlModeTypes = 'qwen-vl' | 'qwen3-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
|
|
132
|
+
export type TVlModeValues = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
|
|
133
|
+
export type TVlModeTypes = 'qwen-vl' | 'doubao-vision' | 'gemini' | 'vlm-ui-tars';
|
|
135
134
|
export interface IModelConfigForVQA {
|
|
136
135
|
[MIDSCENE_VQA_MODEL_NAME]: string;
|
|
137
136
|
[MIDSCENE_VQA_OPENAI_SOCKS_PROXY]?: string;
|
package/dist/types/img/transform.d.ts
CHANGED
|
@@ -49,21 +49,9 @@ export declare function zoomForGPT4o(originalWidth: number, originalHeight: numb
|
|
|
49
49
|
height: number;
|
|
50
50
|
};
|
|
51
51
|
export declare function jimpFromBase64(base64: string): Promise<Jimp>;
|
|
52
|
-
export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<{
|
|
53
|
-
width: number;
|
|
54
|
-
height: number;
|
|
55
|
-
image: Jimp;
|
|
56
|
-
}>;
|
|
57
|
-
export declare function paddingToMatchBlockByBase64(imageBase64: string, blockSize?: number): Promise<{
|
|
58
|
-
width: number;
|
|
59
|
-
height: number;
|
|
60
|
-
imageBase64: string;
|
|
61
|
-
}>;
|
|
62
|
-
export declare function cropByRect(imageBase64: string, rect: Rect, paddingImage: boolean): Promise<{
|
|
63
|
-
width: number;
|
|
64
|
-
height: number;
|
|
65
|
-
imageBase64: string;
|
|
66
|
-
}>;
|
|
52
|
+
export declare function paddingToMatchBlock(image: Jimp, blockSize?: number): Promise<Jimp>;
|
|
53
|
+
export declare function paddingToMatchBlockByBase64(imageBase64: string, blockSize?: number): Promise<string>;
|
|
54
|
+
export declare function cropByRect(imageBase64: string, rect: Rect, paddingImage: boolean): Promise<string>;
|
|
67
55
|
export declare function jimpToBase64(image: Jimp): Promise<string>;
|
|
68
56
|
export declare const httpImg2Base64: (url: string) => Promise<string>;
|
|
69
57
|
/**
|
package/package.json
CHANGED
package/src/env/parse.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
3
3
|
MIDSCENE_USE_GEMINI,
|
|
4
|
-
MIDSCENE_USE_QWEN3_VL,
|
|
5
4
|
MIDSCENE_USE_QWEN_VL,
|
|
6
5
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
7
6
|
type TVlModeTypes,
|
|
@@ -59,14 +58,12 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
|
|
|
59
58
|
} => {
|
|
60
59
|
const isDoubao = provider[MIDSCENE_USE_DOUBAO_VISION];
|
|
61
60
|
const isQwen = provider[MIDSCENE_USE_QWEN_VL];
|
|
62
|
-
const isQwen3 = provider[MIDSCENE_USE_QWEN3_VL];
|
|
63
61
|
const isUiTars = provider[MIDSCENE_USE_VLM_UI_TARS];
|
|
64
62
|
const isGemini = provider[MIDSCENE_USE_GEMINI];
|
|
65
63
|
|
|
66
64
|
const enabledModes = [
|
|
67
65
|
isDoubao && MIDSCENE_USE_DOUBAO_VISION,
|
|
68
66
|
isQwen && MIDSCENE_USE_QWEN_VL,
|
|
69
|
-
isQwen3 && MIDSCENE_USE_QWEN3_VL,
|
|
70
67
|
isUiTars && MIDSCENE_USE_VLM_UI_TARS,
|
|
71
68
|
isGemini && MIDSCENE_USE_GEMINI,
|
|
72
69
|
].filter(Boolean);
|
|
@@ -77,13 +74,6 @@ export const parseVlModeAndUiTarsFromGlobalConfig = (
|
|
|
77
74
|
);
|
|
78
75
|
}
|
|
79
76
|
|
|
80
|
-
if (isQwen3) {
|
|
81
|
-
return {
|
|
82
|
-
vlMode: 'qwen3-vl',
|
|
83
|
-
uiTarsVersion: undefined,
|
|
84
|
-
};
|
|
85
|
-
}
|
|
86
|
-
|
|
87
77
|
if (isQwen) {
|
|
88
78
|
return {
|
|
89
79
|
vlMode: 'qwen-vl',
|
package/src/env/types.ts
CHANGED
|
@@ -29,7 +29,6 @@ export const MIDSCENE_ANDROID_IME_STRATEGY = 'MIDSCENE_ANDROID_IME_STRATEGY';
|
|
|
29
29
|
export const MIDSCENE_CACHE = 'MIDSCENE_CACHE';
|
|
30
30
|
export const MIDSCENE_USE_VLM_UI_TARS = 'MIDSCENE_USE_VLM_UI_TARS';
|
|
31
31
|
export const MIDSCENE_USE_QWEN_VL = 'MIDSCENE_USE_QWEN_VL';
|
|
32
|
-
export const MIDSCENE_USE_QWEN3_VL = 'MIDSCENE_USE_QWEN3_VL';
|
|
33
32
|
export const MIDSCENE_USE_DOUBAO_VISION = 'MIDSCENE_USE_DOUBAO_VISION';
|
|
34
33
|
export const MIDSCENE_USE_GEMINI = 'MIDSCENE_USE_GEMINI';
|
|
35
34
|
export const MIDSCENE_USE_VL_MODEL = 'MIDSCENE_USE_VL_MODEL';
|
|
@@ -243,7 +242,6 @@ export const MODEL_ENV_KEYS = [
|
|
|
243
242
|
MIDSCENE_USE_ANTHROPIC_SDK,
|
|
244
243
|
MIDSCENE_USE_VLM_UI_TARS,
|
|
245
244
|
MIDSCENE_USE_QWEN_VL,
|
|
246
|
-
MIDSCENE_USE_QWEN3_VL,
|
|
247
245
|
MIDSCENE_USE_DOUBAO_VISION,
|
|
248
246
|
MIDSCENE_USE_GEMINI,
|
|
249
247
|
MIDSCENE_USE_VL_MODEL,
|
|
@@ -330,7 +328,6 @@ export type TGlobalConfig = Record<TEnvKeys, string | undefined>;
|
|
|
330
328
|
|
|
331
329
|
export type TVlModeValues =
|
|
332
330
|
| 'qwen-vl'
|
|
333
|
-
| 'qwen3-vl'
|
|
334
331
|
| 'doubao-vision'
|
|
335
332
|
| 'gemini'
|
|
336
333
|
| 'vlm-ui-tars'
|
|
@@ -339,7 +336,6 @@ export type TVlModeValues =
|
|
|
339
336
|
|
|
340
337
|
export type TVlModeTypes =
|
|
341
338
|
| 'qwen-vl'
|
|
342
|
-
| 'qwen3-vl'
|
|
343
339
|
| 'doubao-vision'
|
|
344
340
|
| 'gemini'
|
|
345
341
|
| 'vlm-ui-tars';
|
|
@@ -501,7 +497,6 @@ export const VL_MODE_RAW_VALID_VALUES: TVlModeValues[] = [
|
|
|
501
497
|
'doubao-vision',
|
|
502
498
|
'gemini',
|
|
503
499
|
'qwen-vl',
|
|
504
|
-
'qwen3-vl',
|
|
505
500
|
'vlm-ui-tars',
|
|
506
501
|
'vlm-ui-tars-doubao',
|
|
507
502
|
'vlm-ui-tars-doubao-1.5',
|
package/src/img/transform.ts
CHANGED
|
@@ -228,18 +228,14 @@ export async function jimpFromBase64(base64: string): Promise<Jimp> {
|
|
|
228
228
|
export async function paddingToMatchBlock(
|
|
229
229
|
image: Jimp,
|
|
230
230
|
blockSize = 28,
|
|
231
|
-
): Promise<{
|
|
232
|
-
width: number;
|
|
233
|
-
height: number;
|
|
234
|
-
image: Jimp;
|
|
235
|
-
}> {
|
|
231
|
+
): Promise<Jimp> {
|
|
236
232
|
const { width, height } = image.bitmap;
|
|
237
233
|
|
|
238
234
|
const targetWidth = Math.ceil(width / blockSize) * blockSize;
|
|
239
235
|
const targetHeight = Math.ceil(height / blockSize) * blockSize;
|
|
240
236
|
|
|
241
237
|
if (targetWidth === width && targetHeight === height) {
|
|
242
|
-
return { width, height, image };
|
|
238
|
+
return image;
|
|
243
239
|
}
|
|
244
240
|
|
|
245
241
|
const Jimp = await getJimp();
|
|
@@ -247,52 +243,31 @@ export async function paddingToMatchBlock(
|
|
|
247
243
|
|
|
248
244
|
// Composite the original image onto the new canvas
|
|
249
245
|
paddedImage.composite(image, 0, 0);
|
|
250
|
-
return { width: targetWidth, height: targetHeight, image: paddedImage };
|
|
246
|
+
return paddedImage;
|
|
251
247
|
}
|
|
252
248
|
|
|
253
249
|
export async function paddingToMatchBlockByBase64(
|
|
254
250
|
imageBase64: string,
|
|
255
251
|
blockSize = 28,
|
|
256
|
-
): Promise<{
|
|
257
|
-
width: number;
|
|
258
|
-
height: number;
|
|
259
|
-
imageBase64: string;
|
|
260
|
-
}> {
|
|
252
|
+
): Promise<string> {
|
|
261
253
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
262
|
-
const paddedResult = await paddingToMatchBlock(jimpImage, blockSize);
|
|
263
|
-
return {
|
|
264
|
-
width: paddedResult.width,
|
|
265
|
-
height: paddedResult.height,
|
|
266
|
-
imageBase64: await jimpToBase64(paddedResult.image),
|
|
267
|
-
};
|
|
254
|
+
const paddedImage = await paddingToMatchBlock(jimpImage, blockSize);
|
|
255
|
+
return jimpToBase64(paddedImage);
|
|
268
256
|
}
|
|
269
|
-
|
|
270
257
|
export async function cropByRect(
|
|
271
258
|
imageBase64: string,
|
|
272
259
|
rect: Rect,
|
|
273
260
|
paddingImage: boolean,
|
|
274
|
-
): Promise<{
|
|
275
|
-
width: number;
|
|
276
|
-
height: number;
|
|
277
|
-
imageBase64: string;
|
|
278
|
-
}> {
|
|
261
|
+
): Promise<string> {
|
|
279
262
|
const jimpImage = await jimpFromBase64(imageBase64);
|
|
280
263
|
const { left, top, width, height } = rect;
|
|
281
264
|
jimpImage.crop(left, top, width, height);
|
|
282
265
|
|
|
283
266
|
if (paddingImage) {
|
|
284
|
-
const paddedResult = await paddingToMatchBlock(jimpImage);
|
|
285
|
-
return {
|
|
286
|
-
width: paddedResult.width,
|
|
287
|
-
height: paddedResult.height,
|
|
288
|
-
imageBase64: await jimpToBase64(paddedResult.image),
|
|
289
|
-
};
|
|
267
|
+
const paddedImage = await paddingToMatchBlock(jimpImage);
|
|
268
|
+
return jimpToBase64(paddedImage);
|
|
290
269
|
}
|
|
291
|
-
return {
|
|
292
|
-
width: jimpImage.bitmap.width,
|
|
293
|
-
height: jimpImage.bitmap.height,
|
|
294
|
-
imageBase64: await jimpToBase64(jimpImage),
|
|
295
|
-
};
|
|
270
|
+
return jimpToBase64(jimpImage);
|
|
296
271
|
}
|
|
297
272
|
|
|
298
273
|
export async function jimpToBase64(image: Jimp): Promise<string> {
|