@midscene/core 1.8.11 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +40 -50
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/task-builder.mjs +39 -19
- package/dist/es/agent/task-builder.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +24 -22
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +11 -14
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/connectivity.mjs +7 -3
- package/dist/es/ai-model/connectivity.mjs.map +1 -1
- package/dist/es/ai-model/errors.mjs +9 -0
- package/dist/es/ai-model/errors.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +3 -4
- package/dist/es/ai-model/inspect.mjs +132 -144
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +46 -28
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/actions.mjs +22 -44
- package/dist/es/ai-model/models/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs +45 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs +112 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/planning.mjs +6 -7
- package/dist/es/ai-model/models/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/prompt.mjs +3 -11
- package/dist/es/ai-model/models/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/models/default.mjs +12 -0
- package/dist/es/ai-model/models/default.mjs.map +1 -0
- package/dist/es/ai-model/models/doubao.mjs +138 -0
- package/dist/es/ai-model/models/doubao.mjs.map +1 -0
- package/dist/es/ai-model/models/gemini.mjs +34 -0
- package/dist/es/ai-model/models/gemini.mjs.map +1 -0
- package/dist/es/ai-model/models/glm.mjs +37 -0
- package/dist/es/ai-model/models/glm.mjs.map +1 -0
- package/dist/es/ai-model/models/gpt.mjs +31 -0
- package/dist/es/ai-model/models/gpt.mjs.map +1 -0
- package/dist/es/ai-model/models/index.mjs +2 -0
- package/dist/es/ai-model/models/qwen.mjs +113 -0
- package/dist/es/ai-model/models/qwen.mjs.map +1 -0
- package/dist/es/ai-model/models/registry.mjs +45 -0
- package/dist/es/ai-model/models/registry.mjs.map +1 -0
- package/dist/es/ai-model/models/resolved.mjs +104 -0
- package/dist/es/ai-model/models/resolved.mjs.map +1 -0
- package/dist/es/ai-model/models/types.mjs +0 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs +142 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs.map +1 -0
- package/dist/es/ai-model/{ui-tars-planning.mjs → models/ui-tars/planning.mjs} +44 -62
- package/dist/es/ai-model/models/ui-tars/planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +3 -3
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +11 -11
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +25 -60
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -10
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs +9 -0
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs.map +1 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs +15 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs +107 -0
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +59 -190
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/json.mjs +60 -0
- package/dist/es/ai-model/service-caller/json.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs +68 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs +96 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/index.mjs +3 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs +41 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs +64 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/types.mjs +0 -0
- package/dist/es/ai-model/types.mjs +0 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs +27 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/index.mjs +2 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs +23 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs +18 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/types.mjs +0 -0
- package/dist/es/ai-model/workflows/planning/index.mjs +5 -0
- package/dist/es/ai-model/workflows/planning/index.mjs.map +1 -0
- package/dist/es/ai-model/workflows/planning/types.mjs +0 -0
- package/dist/es/common.mjs +2 -174
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/service/index.mjs +96 -69
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +4 -3
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +43 -53
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/task-builder.js +38 -18
- package/dist/lib/agent/task-builder.js.map +1 -1
- package/dist/lib/agent/tasks.js +23 -21
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +17 -17
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/connectivity.js +7 -3
- package/dist/lib/ai-model/connectivity.js.map +1 -1
- package/dist/lib/ai-model/errors.js +46 -0
- package/dist/lib/ai-model/errors.js.map +1 -0
- package/dist/lib/ai-model/index.js +7 -14
- package/dist/lib/ai-model/inspect.js +141 -144
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +44 -26
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/actions.js +22 -44
- package/dist/lib/ai-model/models/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js +79 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js +146 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/planning.js +8 -9
- package/dist/lib/ai-model/models/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/prompt.js +14 -16
- package/dist/lib/ai-model/models/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm/util.js → models/default.js} +13 -13
- package/dist/lib/ai-model/models/default.js.map +1 -0
- package/dist/lib/ai-model/models/doubao.js +184 -0
- package/dist/lib/ai-model/models/doubao.js.map +1 -0
- package/dist/lib/ai-model/models/gemini.js +68 -0
- package/dist/lib/ai-model/models/gemini.js.map +1 -0
- package/dist/lib/ai-model/models/glm.js +71 -0
- package/dist/lib/ai-model/models/glm.js.map +1 -0
- package/dist/lib/ai-model/models/gpt.js +65 -0
- package/dist/lib/ai-model/models/gpt.js.map +1 -0
- package/dist/lib/ai-model/{service-caller/image-detail.js → models/index.js} +8 -7
- package/dist/lib/ai-model/models/index.js.map +1 -0
- package/dist/lib/ai-model/models/qwen.js +147 -0
- package/dist/lib/ai-model/models/qwen.js.map +1 -0
- package/dist/lib/ai-model/models/registry.js +85 -0
- package/dist/lib/ai-model/models/registry.js.map +1 -0
- package/dist/lib/ai-model/models/resolved.js +138 -0
- package/dist/lib/ai-model/models/resolved.js.map +1 -0
- package/dist/lib/ai-model/models/types.js +20 -0
- package/dist/lib/ai-model/models/types.js.map +1 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js +176 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js.map +1 -0
- package/dist/lib/ai-model/{ui-tars-planning.js → models/ui-tars/planning.js} +44 -62
- package/dist/lib/ai-model/models/ui-tars/planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +3 -3
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +11 -11
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +25 -60
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +15 -10
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js +43 -0
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js.map +1 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js +52 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +5 -5
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +5 -5
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js +150 -0
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +68 -199
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/service-caller/json.js +100 -0
- package/dist/lib/ai-model/service-caller/json.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js +117 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js +130 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js.map +1 -0
- package/dist/lib/ai-model/{prompt/common.js → shared/model-locate-result/index.js} +9 -9
- package/dist/lib/ai-model/shared/model-locate-result/index.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js +78 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js +98 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js +20 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js.map +1 -0
- package/dist/lib/ai-model/types.js +20 -0
- package/dist/lib/ai-model/types.js.map +1 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js +61 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/index.js +50 -0
- package/dist/lib/ai-model/workflows/inspect/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js +60 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js +52 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/types.js +20 -0
- package/dist/lib/ai-model/workflows/inspect/types.js.map +1 -0
- package/dist/lib/ai-model/{model-family.js → workflows/planning/index.js} +6 -7
- package/dist/lib/ai-model/workflows/planning/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/planning/types.js +20 -0
- package/dist/lib/ai-model/workflows/planning/types.js.map +1 -0
- package/dist/lib/common.js +4 -206
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/service/index.js +96 -69
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +4 -3
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +14 -6
- package/dist/types/agent/task-builder.d.ts +2 -2
- package/dist/types/agent/tasks.d.ts +6 -6
- package/dist/types/agent/utils.d.ts +8 -5
- package/dist/types/ai-model/errors.d.ts +2 -0
- package/dist/types/ai-model/index.d.ts +2 -4
- package/dist/types/ai-model/inspect.d.ts +13 -33
- package/dist/types/ai-model/llm-planning.d.ts +6 -17
- package/dist/types/ai-model/{auto-glm → models/auto-glm}/actions.d.ts +2 -2
- package/dist/types/ai-model/models/auto-glm/adapter.d.ts +5 -0
- package/dist/types/ai-model/models/auto-glm/locate.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/planning.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/prompt.d.ts +4 -0
- package/dist/types/ai-model/models/default.d.ts +2 -0
- package/dist/types/ai-model/models/doubao.d.ts +10 -0
- package/dist/types/ai-model/models/gemini.d.ts +18 -0
- package/dist/types/ai-model/models/glm.d.ts +18 -0
- package/dist/types/ai-model/models/gpt.d.ts +18 -0
- package/dist/types/ai-model/models/index.d.ts +2 -0
- package/dist/types/ai-model/models/qwen.d.ts +30 -0
- package/dist/types/ai-model/models/registry.d.ts +81 -0
- package/dist/types/ai-model/models/resolved.d.ts +9 -0
- package/dist/types/ai-model/models/types.d.ts +102 -0
- package/dist/types/ai-model/models/ui-tars/adapter.d.ts +6 -0
- package/dist/types/ai-model/{ui-tars-planning.d.ts → models/ui-tars/planning.d.ts} +7 -11
- package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +5 -5
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/locate-grounding-rules.d.ts +1 -0
- package/dist/types/ai-model/prompt/locate-param-example.d.ts +3 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +3 -3
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +3 -3
- package/dist/types/ai-model/prompts/locate-result-coordinates.d.ts +6 -0
- package/dist/types/ai-model/service-caller/index.d.ts +19 -27
- package/dist/types/ai-model/service-caller/json.d.ts +9 -0
- package/dist/types/ai-model/shared/model-locate-result/bbox.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/factory.d.ts +2 -0
- package/dist/types/ai-model/shared/model-locate-result/index.d.ts +3 -0
- package/dist/types/ai-model/shared/model-locate-result/parse.d.ts +5 -0
- package/dist/types/ai-model/shared/model-locate-result/pixel-bbox-mapper.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/types.d.ts +157 -0
- package/dist/types/ai-model/types.d.ts +2 -0
- package/dist/types/ai-model/workflows/image-preprocess.d.ts +30 -0
- package/dist/types/ai-model/workflows/inspect/index.d.ts +1 -0
- package/dist/types/ai-model/workflows/inspect/locate-result-rect.d.ts +4 -0
- package/dist/types/ai-model/workflows/inspect/search-area-mapping.d.ts +3 -0
- package/dist/types/ai-model/workflows/inspect/types.d.ts +37 -0
- package/dist/types/ai-model/workflows/planning/index.d.ts +2 -0
- package/dist/types/ai-model/workflows/planning/types.d.ts +15 -0
- package/dist/types/common.d.ts +0 -30
- package/dist/types/device/index.d.ts +22 -22
- package/dist/types/service/index.d.ts +5 -4
- package/dist/types/types.d.ts +21 -9
- package/dist/types/yaml.d.ts +8 -2
- package/package.json +2 -2
- package/dist/es/ai-model/auto-glm/actions.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/index.mjs +0 -6
- package/dist/es/ai-model/auto-glm/parser.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/planning.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/util.mjs +0 -9
- package/dist/es/ai-model/auto-glm/util.mjs.map +0 -1
- package/dist/es/ai-model/model-family.mjs +0 -6
- package/dist/es/ai-model/model-family.mjs.map +0 -1
- package/dist/es/ai-model/prompt/common.mjs +0 -8
- package/dist/es/ai-model/prompt/common.mjs.map +0 -1
- package/dist/es/ai-model/service-caller/image-detail.mjs +0 -6
- package/dist/es/ai-model/service-caller/image-detail.mjs.map +0 -1
- package/dist/es/ai-model/ui-tars-planning.mjs.map +0 -1
- package/dist/lib/ai-model/auto-glm/actions.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/index.js +0 -66
- package/dist/lib/ai-model/auto-glm/index.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/parser.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/planning.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/prompt.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/util.js.map +0 -1
- package/dist/lib/ai-model/model-family.js.map +0 -1
- package/dist/lib/ai-model/prompt/common.js.map +0 -1
- package/dist/lib/ai-model/service-caller/image-detail.js.map +0 -1
- package/dist/lib/ai-model/ui-tars-planning.js.map +0 -1
- package/dist/types/ai-model/auto-glm/index.d.ts +0 -6
- package/dist/types/ai-model/auto-glm/planning.d.ts +0 -12
- package/dist/types/ai-model/auto-glm/prompt.d.ts +0 -27
- package/dist/types/ai-model/auto-glm/util.d.ts +0 -13
- package/dist/types/ai-model/model-family.d.ts +0 -7
- package/dist/types/ai-model/prompt/common.d.ts +0 -2
- package/dist/types/ai-model/service-caller/image-detail.d.ts +0 -2
- /package/dist/es/ai-model/{auto-glm → models/auto-glm}/parser.mjs +0 -0
- /package/dist/lib/ai-model/{auto-glm → models/auto-glm}/parser.js +0 -0
- /package/dist/types/ai-model/{auto-glm → models/auto-glm}/parser.d.ts +0 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { unwrapCoordinateListLikeInput } from "../shared/model-locate-result/index.mjs";
|
|
2
|
+
/** Edge length added to a bare point to synthesize a box around it. */
const defaultBboxSize = 20;

/**
 * Builds a pixel bbox from a point treated as the box's top-left corner.
 *
 * @param {number} x - Left coordinate of the point.
 * @param {number} y - Top coordinate of the point.
 * @returns {number[]} `[x, y, x + defaultBboxSize, y + defaultBboxSize]`,
 *   each entry rounded with `Math.round`.
 */
function topLeftPointToPixelBbox(x, y) {
  const corners = [x, y, x + defaultBboxSize, y + defaultBboxSize];
  return corners.map((value) => Math.round(value));
}
|
|
11
|
+
/**
 * Converts a raw locate value into a tagged `bbox` or `point` result.
 *
 * The input is first passed through `unwrapCoordinateListLikeInput`. When the
 * third and fourth entries are both numbers the result is a `bbox` carrying
 * the first four entries; otherwise it is a `point` carrying the first two.
 *
 * @param {unknown} input - Raw locate value as produced by the model.
 * @returns {{type: 'bbox' | 'point', coordinates: number[]}} Tagged result.
 * @throws {Error} When fewer than two coordinates are available.
 */
function parseQwen25RawLocateValue(input) {
  const values = unwrapCoordinateListLikeInput(input);

  if (values.length < 2) {
    throw new Error(`invalid bbox data for qwen-vl mode: ${JSON.stringify(values)} `);
  }

  const hasSecondCorner =
    typeof values[2] === 'number' && typeof values[3] === 'number';

  return hasSecondCorner
    ? { type: 'bbox', coordinates: [values[0], values[1], values[2], values[3]] }
    : { type: 'point', coordinates: [values[0], values[1]] };
}
|
|
34
|
+
/**
 * Maps a tagged locate result to a pixel bbox.
 *
 * A `bbox` result has its four coordinates rounded individually. Any other
 * result is treated as a point and expanded via `topLeftPointToPixelBbox`.
 *
 * @param {{type: string, coordinates: number[]}} result - Tagged locate result.
 * @returns {number[]} Four-entry pixel bbox.
 */
function normalizeQwen25ResultToPixelBbox(result) {
  const { coordinates } = result;

  if (result.type !== 'bbox') {
    return topLeftPointToPixelBbox(coordinates[0], coordinates[1]);
  }

  return [0, 1, 2, 3].map((index) => Math.round(coordinates[index]));
}
|
|
43
|
+
/**
 * Builds chat-completion parameters for the adapter registered under the
 * qwen3 family keys.
 *
 * `temperature` falls back to the Midscene default when the user supplies
 * none. `enable_thinking` is always sent and defaults to `false`.
 * `thinking_budget` is added only when the user set `reasoningBudget`.
 *
 * @param {{midsceneDefaults: object, userConfig: object}} input - Call context.
 * @returns {{config: object}} Extra request parameters.
 */
const buildQwenChatCompletionParams = (input) => {
  const { midsceneDefaults, userConfig } = input;
  const budget = userConfig.reasoningBudget;

  const config = {
    temperature: userConfig.temperature ?? midsceneDefaults.temperature,
    enable_thinking: userConfig.reasoningEnabled ?? false,
  };

  if (budget !== undefined) {
    config.thinking_budget = budget;
  }

  return { config };
};
|
|
56
|
+
/**
 * Builds chat-completion parameters for the `qwen2.5-vl` adapter.
 *
 * Uses the user's temperature when set, otherwise the Midscene default, and
 * always sends `vl_high_resolution_images: true`.
 *
 * @param {{midsceneDefaults: object, userConfig: object}} input - Call context.
 * @returns {{config: object}} Extra request parameters.
 */
const buildQwen25ChatCompletionParams = ({ midsceneDefaults, userConfig }) => {
  const temperature = userConfig.temperature ?? midsceneDefaults.temperature;

  return {
    config: { temperature, vl_high_resolution_images: true },
  };
};
|
|
65
|
+
/**
 * Adapter definition shared by every qwen3-family key in `qwenAdapters`.
 *
 * Declares `reasoningEffort` as an unsupported user setting and describes
 * locate results as `xy`-ordered bboxes with `normalizedBy: 1000`.
 * NOTE(review): `normalizedBy` presumably means coordinates are on a 0-1000
 * scale; confirm against the locate-result factory.
 */
const qwen3Adapter = {
  chatCompletion: {
    unsupportedUserConfig: ['reasoningEffort'],
    buildChatCompletionParams: buildQwenChatCompletionParams,
  },
  locate: {
    resultAdapter: {
      coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },
    },
  },
};
|
|
82
|
+
/**
 * Adapter definitions for the Qwen model families, keyed by family name.
 *
 * `qwen2.5-vl` has its own definition: it rejects all three reasoning
 * settings, sets an image-preprocess `padBlockSize` of 28, and supplies
 * custom locate parsing and pixel-bbox mapping. The remaining keys all share
 * the single `qwen3Adapter` object.
 */
const qwenAdapters = {
  'qwen2.5-vl': {
    chatCompletion: {
      unsupportedUserConfig: ['reasoningEnabled', 'reasoningEffort', 'reasoningBudget'],
      buildChatCompletionParams: buildQwen25ChatCompletionParams,
    },
    // NOTE(review): presumably images are padded to multiples of 28 pixels;
    // confirm in the image-preprocess workflow.
    imagePreprocess: { padBlockSize: 28 },
    locate: {
      resultAdapter: {
        coordinates: { shape: 'bbox', order: 'xy' },
        parseRawLocateValue: parseQwen25RawLocateValue,
        mapLocateResultToPixelBbox: normalizeQwen25ResultToPixelBbox,
      },
    },
  },
  'qwen3-vl': qwen3Adapter,
  qwen3: qwen3Adapter,
  'qwen3.5': qwen3Adapter,
  'qwen3.6': qwen3Adapter,
};
|
|
111
|
+
export { qwenAdapters };
|
|
112
|
+
|
|
113
|
+
//# sourceMappingURL=qwen.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/qwen.mjs","sources":["../../../../src/ai-model/models/qwen.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport {\n type LocateResultValue,\n type PixelBbox,\n unwrapCoordinateListLikeInput,\n} from '../shared/model-locate-result';\nimport type {\n ChatCompletionCallContext,\n ChatCompletionParamsResult,\n ModelAdapterDefinition,\n} from './types';\n\nconst defaultBboxSize = 20;\n\nfunction topLeftPointToPixelBbox(x: number, y: number): PixelBbox {\n return [\n Math.round(x),\n Math.round(y),\n Math.round(x + defaultBboxSize),\n Math.round(y + defaultBboxSize),\n ];\n}\n\nfunction parseQwen25RawLocateValue(input: unknown): LocateResultValue {\n const bbox = unwrapCoordinateListLikeInput(input as any) as number[];\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n if (typeof bbox[2] === 'number' && typeof bbox[3] === 'number') {\n return {\n type: 'bbox',\n coordinates: [bbox[0], bbox[1], bbox[2], bbox[3]],\n };\n }\n\n return { type: 'point', coordinates: [bbox[0], bbox[1]] };\n}\n\nfunction normalizeQwen25ResultToPixelBbox(\n result: LocateResultValue,\n): PixelBbox {\n if (result.type === 'bbox') {\n return [\n Math.round(result.coordinates[0]),\n Math.round(result.coordinates[1]),\n Math.round(result.coordinates[2]),\n Math.round(result.coordinates[3]),\n ];\n }\n\n return topLeftPointToPixelBbox(result.coordinates[0], result.coordinates[1]);\n}\n\nconst buildQwenChatCompletionParams = (\n input: ChatCompletionCallContext,\n): ChatCompletionParamsResult => {\n const { midsceneDefaults, userConfig } = input;\n const { reasoningEnabled, reasoningBudget } = userConfig;\n const effectiveReasoningEnabled = reasoningEnabled ?? false;\n const config: Record<string, unknown> = {\n temperature: userConfig.temperature ?? 
midsceneDefaults.temperature,\n enable_thinking: effectiveReasoningEnabled,\n };\n\n if (reasoningBudget !== undefined) {\n config.thinking_budget = reasoningBudget;\n }\n\n return { config };\n};\n\nconst buildQwen25ChatCompletionParams = (\n input: ChatCompletionCallContext,\n): ChatCompletionParamsResult => {\n const { midsceneDefaults, userConfig } = input;\n return {\n config: {\n temperature: userConfig.temperature ?? midsceneDefaults.temperature,\n vl_high_resolution_images: true,\n },\n };\n};\n\nconst qwen3Adapter: ModelAdapterDefinition = {\n chatCompletion: {\n unsupportedUserConfig: ['reasoningEffort'],\n buildChatCompletionParams: buildQwenChatCompletionParams,\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },\n },\n },\n};\n\nexport const qwenAdapters = {\n 'qwen2.5-vl': {\n chatCompletion: {\n unsupportedUserConfig: [\n 'reasoningEnabled',\n 'reasoningEffort',\n 'reasoningBudget',\n ],\n buildChatCompletionParams: buildQwen25ChatCompletionParams,\n },\n imagePreprocess: {\n padBlockSize: 28,\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'bbox', order: 'xy' },\n parseRawLocateValue: parseQwen25RawLocateValue,\n mapLocateResultToPixelBbox: normalizeQwen25ResultToPixelBbox,\n },\n },\n },\n 'qwen3-vl': qwen3Adapter,\n qwen3: qwen3Adapter,\n 'qwen3.5': qwen3Adapter,\n 'qwen3.6': qwen3Adapter,\n} satisfies Pick<\n Record<TModelFamily, ModelAdapterDefinition>,\n 'qwen2.5-vl' | 'qwen3-vl' | 'qwen3' | 'qwen3.5' | 
'qwen3.6'\n>;\n"],"names":["defaultBboxSize","topLeftPointToPixelBbox","x","y","Math","parseQwen25RawLocateValue","input","bbox","unwrapCoordinateListLikeInput","msg","JSON","Error","normalizeQwen25ResultToPixelBbox","result","buildQwenChatCompletionParams","midsceneDefaults","userConfig","reasoningEnabled","reasoningBudget","effectiveReasoningEnabled","config","undefined","buildQwen25ChatCompletionParams","qwen3Adapter","qwenAdapters"],"mappings":";AAYA,MAAMA,kBAAkB;AAExB,SAASC,wBAAwBC,CAAS,EAAEC,CAAS;IACnD,OAAO;QACLC,KAAK,KAAK,CAACF;QACXE,KAAK,KAAK,CAACD;QACXC,KAAK,KAAK,CAACF,IAAIF;QACfI,KAAK,KAAK,CAACD,IAAIH;KAChB;AACH;AAEA,SAASK,0BAA0BC,KAAc;IAC/C,MAAMC,OAAOC,8BAA8BF;IAC3C,IAAIC,KAAK,MAAM,GAAG,GAAG;QACnB,MAAME,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACH,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAII,MAAMF;IAClB;IAEA,IAAI,AAAmB,YAAnB,OAAOF,IAAI,CAAC,EAAE,IAAiB,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EAC/C,OAAO;QACL,MAAM;QACN,aAAa;YAACA,IAAI,CAAC,EAAE;YAAEA,IAAI,CAAC,EAAE;YAAEA,IAAI,CAAC,EAAE;YAAEA,IAAI,CAAC,EAAE;SAAC;IACnD;IAGF,OAAO;QAAE,MAAM;QAAS,aAAa;YAACA,IAAI,CAAC,EAAE;YAAEA,IAAI,CAAC,EAAE;SAAC;IAAC;AAC1D;AAEA,SAASK,iCACPC,MAAyB;IAEzB,IAAIA,AAAgB,WAAhBA,OAAO,IAAI,EACb,OAAO;QACLT,KAAK,KAAK,CAACS,OAAO,WAAW,CAAC,EAAE;QAChCT,KAAK,KAAK,CAACS,OAAO,WAAW,CAAC,EAAE;QAChCT,KAAK,KAAK,CAACS,OAAO,WAAW,CAAC,EAAE;QAChCT,KAAK,KAAK,CAACS,OAAO,WAAW,CAAC,EAAE;KACjC;IAGH,OAAOZ,wBAAwBY,OAAO,WAAW,CAAC,EAAE,EAAEA,OAAO,WAAW,CAAC,EAAE;AAC7E;AAEA,MAAMC,gCAAgC,CACpCR;IAEA,MAAM,EAAES,gBAAgB,EAAEC,UAAU,EAAE,GAAGV;IACzC,MAAM,EAAEW,gBAAgB,EAAEC,eAAe,EAAE,GAAGF;IAC9C,MAAMG,4BAA4BF,oBAAoB;IACtD,MAAMG,SAAkC;QACtC,aAAaJ,WAAW,WAAW,IAAID,iBAAiB,WAAW;QACnE,iBAAiBI;IACnB;IAEA,IAAID,AAAoBG,WAApBH,iBACFE,OAAO,eAAe,GAAGF;IAG3B,OAAO;QAAEE;IAAO;AAClB;AAEA,MAAME,kCAAkC,CACtChB;IAEA,MAAM,EAAES,gBAAgB,EAAEC,UAAU,EAAE,GAAGV;IACzC,OAAO;QACL,QAAQ;YACN,aAAaU,WAAW,WAAW,IAAID,iBAAiB,WAAW;YACnE,2BAA2B;QAC7B;IACF;AACF;AAEA,MAAMQ,eAAuC;IAC3C,gBAAgB;QACd,uBAAuB;YAAC;SAAkB;QAC1C,2BAA2BT;IAC7B;IACA,QAAQ;QACN,eAAe;YACb,aAAa;gBAAE,OAAO;gBAAQ,OAA
O;gBAAM,cAAc;YAAK;QAChE;IACF;AACF;AAEO,MAAMU,eAAe;IAC1B,cAAc;QACZ,gBAAgB;YACd,uBAAuB;gBACrB;gBACA;gBACA;aACD;YACD,2BAA2BF;QAC7B;QACA,iBAAiB;YACf,cAAc;QAChB;QACA,QAAQ;YACN,eAAe;gBACb,aAAa;oBAAE,OAAO;oBAAQ,OAAO;gBAAK;gBAC1C,qBAAqBjB;gBACrB,4BAA4BO;YAC9B;QACF;IACF;IACA,YAAYW;IACZ,OAAOA;IACP,WAAWA;IACX,WAAWA;AACb"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
2
|
+
import { autoGlmAdapters } from "./auto-glm/adapter.mjs";
|
|
3
|
+
import { defaultOpenAICompatibleAdapterConfig } from "./default.mjs";
|
|
4
|
+
import { doubaoAdapters } from "./doubao.mjs";
|
|
5
|
+
import { geminiAdapters } from "./gemini.mjs";
|
|
6
|
+
import { glmAdapters } from "./glm.mjs";
|
|
7
|
+
import { gptAdapters } from "./gpt.mjs";
|
|
8
|
+
import { qwenAdapters } from "./qwen.mjs";
|
|
9
|
+
import { ResolvedModelAdapter } from "./resolved.mjs";
|
|
10
|
+
import { uiTarsAdapters } from "./ui-tars/adapter.mjs";
|
|
11
|
+
/**
 * Every registered adapter definition, keyed by model family.
 * Later spreads win if two adapter groups ever declare the same key.
 */
const MODEL_ADAPTER_CONFIGS = {
  ...qwenAdapters,
  ...doubaoAdapters,
  ...geminiAdapters,
  ...uiTarsAdapters,
  ...glmAdapters,
  ...autoGlmAdapters,
  ...gptAdapters,
};

/** Resolved adapters keyed by model family, or `'default'` when none is given. */
const modelAdapterCache = new Map();

/** Debug logger for adapter resolution, under the `ai:model-adapter` topic. */
const debugModelAdapter = getDebug('ai:model-adapter');
|
|
22
|
+
/**
 * Logs the user-config keys an adapter declares unsupported.
 * Nothing is logged when the list is empty.
 *
 * @param {string} modelFamily - Family name (or cache key) used in the message.
 * @param {{chatCompletion: {unsupportedUserConfig: string[]}}} adapter - Resolved adapter.
 * @returns {void}
 */
function debugAdapterUnsupportedUserConfig(modelFamily, adapter) {
  const { unsupportedUserConfig } = adapter.chatCompletion;

  if (unsupportedUserConfig.length !== 0) {
    debugModelAdapter(`model adapter "${modelFamily}" unsupportedUserConfig: ${JSON.stringify(unsupportedUserConfig)}`);
  }
}
|
|
26
|
+
/**
 * Returns the resolved adapter for a model family, building and caching it on
 * first use.
 *
 * When `modelFamily` is omitted, the default OpenAI-compatible definition is
 * used and cached under the key `'default'`.
 *
 * @param {string} [modelFamily] - Model family name to resolve.
 * @returns {object} The cached or newly created `ResolvedModelAdapter`.
 * @throws {Error} When `modelFamily` is given but no adapter is registered
 *   for it.
 */
function getModelAdapter(modelFamily) {
  const cacheKey = modelFamily ?? 'default';

  const cached = modelAdapterCache.get(cacheKey);
  if (cached) return cached;

  let config;
  if (modelFamily) {
    // Only own keys count as registered families. A plain index lookup would
    // also resolve inherited members such as "constructor" or "toString",
    // skipping the error below and handing a non-config value to the adapter.
    const isRegistered = Object.prototype.hasOwnProperty.call(
      MODEL_ADAPTER_CONFIGS,
      modelFamily,
    );
    config = isRegistered ? MODEL_ADAPTER_CONFIGS[modelFamily] : undefined;
  } else {
    config = defaultOpenAICompatibleAdapterConfig;
  }

  if (!config) {
    throw new Error(`No model adapter registered for modelFamily: ${modelFamily}`);
  }

  const adapter = new ResolvedModelAdapter(config, cacheKey);
  modelAdapterCache.set(cacheKey, adapter);
  debugAdapterUnsupportedUserConfig(cacheKey, adapter);

  return adapter;
}
|
|
37
|
+
/**
 * Pairs a model config with the adapter resolved for its `modelFamily`.
 *
 * @param {{modelFamily?: string}} config - Model configuration.
 * @returns {{config: object, adapter: object}} The same config object plus
 *   its adapter.
 */
function getModelRuntime(config) {
  const adapter = getModelAdapter(config.modelFamily);
  return { config, adapter };
}
|
|
43
|
+
export { MODEL_ADAPTER_CONFIGS, getModelAdapter, getModelRuntime };
|
|
44
|
+
|
|
45
|
+
//# sourceMappingURL=registry.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/registry.mjs","sources":["../../../../src/ai-model/models/registry.ts"],"sourcesContent":["import type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport { autoGlmAdapters } from './auto-glm/adapter';\nimport { defaultOpenAICompatibleAdapterConfig } from './default';\nimport { doubaoAdapters } from './doubao';\nimport { geminiAdapters } from './gemini';\nimport { glmAdapters } from './glm';\nimport { gptAdapters } from './gpt';\nimport { qwenAdapters } from './qwen';\nimport { ResolvedModelAdapter } from './resolved';\nimport type {\n ModelAdapter,\n ModelAdapterDefinition,\n ModelRuntime,\n} from './types';\nimport { uiTarsAdapters } from './ui-tars/adapter';\n\nexport const MODEL_ADAPTER_CONFIGS = {\n ...qwenAdapters,\n ...doubaoAdapters,\n ...geminiAdapters,\n ...uiTarsAdapters,\n ...glmAdapters,\n ...autoGlmAdapters,\n ...gptAdapters,\n} satisfies Record<TModelFamily, ModelAdapterDefinition>;\n\ntype ModelAdapterCacheKey = TModelFamily | 'default';\n\nconst modelAdapterCache = new Map<ModelAdapterCacheKey, ModelAdapter>();\nconst debugModelAdapter = getDebug('ai:model-adapter');\n\nfunction debugAdapterUnsupportedUserConfig(\n modelFamily: ModelAdapterCacheKey,\n adapter: ModelAdapter,\n): void {\n if (adapter.chatCompletion.unsupportedUserConfig.length === 0) {\n return;\n }\n\n debugModelAdapter(\n `model adapter \"${modelFamily}\" unsupportedUserConfig: ${JSON.stringify(\n adapter.chatCompletion.unsupportedUserConfig,\n )}`,\n );\n}\n\nexport function getModelAdapter(modelFamily?: TModelFamily): ModelAdapter {\n const cacheKey: ModelAdapterCacheKey = modelFamily ?? 'default';\n let adapter = modelAdapterCache.get(cacheKey);\n if (adapter) {\n return adapter;\n }\n\n const config = modelFamily\n ? 
MODEL_ADAPTER_CONFIGS[modelFamily]\n : defaultOpenAICompatibleAdapterConfig;\n if (!config) {\n throw new Error(\n `No model adapter registered for modelFamily: ${modelFamily}`,\n );\n }\n\n adapter = new ResolvedModelAdapter(config, cacheKey);\n modelAdapterCache.set(cacheKey, adapter);\n debugAdapterUnsupportedUserConfig(cacheKey, adapter);\n\n return adapter;\n}\n\nexport function getModelRuntime(config: IModelConfig): ModelRuntime {\n return {\n config,\n adapter: getModelAdapter(config.modelFamily),\n };\n}\n"],"names":["MODEL_ADAPTER_CONFIGS","qwenAdapters","doubaoAdapters","geminiAdapters","uiTarsAdapters","glmAdapters","autoGlmAdapters","gptAdapters","modelAdapterCache","Map","debugModelAdapter","getDebug","debugAdapterUnsupportedUserConfig","modelFamily","adapter","JSON","getModelAdapter","cacheKey","config","defaultOpenAICompatibleAdapterConfig","Error","ResolvedModelAdapter","getModelRuntime"],"mappings":";;;;;;;;;;AAiBO,MAAMA,wBAAwB;IACnC,GAAGC,YAAY;IACf,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,cAAc;IACjB,GAAGC,WAAW;IACd,GAAGC,eAAe;IAClB,GAAGC,WAAW;AAChB;AAIA,MAAMC,oBAAoB,IAAIC;AAC9B,MAAMC,oBAAoBC,SAAS;AAEnC,SAASC,kCACPC,WAAiC,EACjCC,OAAqB;IAErB,IAAIA,AAAwD,MAAxDA,QAAQ,cAAc,CAAC,qBAAqB,CAAC,MAAM,EACrD;IAGFJ,kBACE,CAAC,eAAe,EAAEG,YAAY,yBAAyB,EAAEE,KAAK,SAAS,CACrED,QAAQ,cAAc,CAAC,qBAAqB,GAC3C;AAEP;AAEO,SAASE,gBAAgBH,WAA0B;IACxD,MAAMI,WAAiCJ,eAAe;IACtD,IAAIC,UAAUN,kBAAkB,GAAG,CAACS;IACpC,IAAIH,SACF,OAAOA;IAGT,MAAMI,SAASL,cACXb,qBAAqB,CAACa,YAAY,GAClCM;IACJ,IAAI,CAACD,QACH,MAAM,IAAIE,MACR,CAAC,6CAA6C,EAAEP,aAAa;IAIjEC,UAAU,IAAIO,qBAAqBH,QAAQD;IAC3CT,kBAAkB,GAAG,CAACS,UAAUH;IAChCF,kCAAkCK,UAAUH;IAE5C,OAAOA;AACT;AAEO,SAASQ,gBAAgBJ,MAAoB;IAClD,OAAO;QACLA;QACA,SAASF,gBAAgBE,OAAO,WAAW;IAC7C;AACF"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { normalJsonParser } from "../service-caller/json.mjs";
|
|
2
|
+
import { createLocateResultAdapter } from "../shared/model-locate-result/factory.mjs";
|
|
3
|
+
/**
 * Compiler-emitted helper used to initialise class fields.
 *
 * When `key` is not yet reachable on `obj` (own or inherited) a plain
 * assignment is used. Otherwise the property is (re)defined directly on `obj`
 * as an enumerable, configurable, writable data property.
 *
 * @param {object} obj - Target object; mutated in place.
 * @param {PropertyKey} key - Property to set.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj` that was passed in.
 */
function _define_property(obj, key, value) {
  if (!(key in obj)) {
    obj[key] = value;
    return obj;
  }

  Object.defineProperty(obj, key, {
    value,
    enumerable: true,
    configurable: true,
    writable: true,
  });
  return obj;
}
|
|
13
|
+
// Replanning-cycle limit used when an adapter definition does not set its own.
const defaultReplanningCycleLimit = 20;

// Fallback image-detail resolver: ignores its input and yields `undefined`.
const defaultImageDetail = (_input) => undefined;

// Defaults injected into adapter callbacks as `midsceneDefaults`.
const midsceneChatCompletionDefaults = { temperature: 0 };

/**
 * Fallback chat-completion parameter builder.
 *
 * @param {{ midsceneDefaults: { temperature: number }, userConfig: { temperature?: number } }} callContext
 * @returns {{ config: { temperature: number } }} The user's temperature when it
 *   is not null/undefined, otherwise the Midscene default.
 */
const defaultChatCompletionParams = (callContext) => {
  const { midsceneDefaults, userConfig } = callContext;
  const temperature = userConfig.temperature ?? midsceneDefaults.temperature;
  return { config: { temperature } };
};

// Locate-result adapter definition used when a model family supplies none.
const defaultLocateResultAdapterDefinition = {
  coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },
};
|
|
30
|
+
/**
 * Turns an adapter definition's `jsonParser` setting into a parser function.
 *
 * @param {undefined | null | 'lenient-json' | Function} jsonParser - Preset
 *   name, custom parser function, or nothing.
 * @returns {Function} `normalJsonParser` for a falsy value or the
 *   `'lenient-json'` preset; the given function otherwise.
 * @throws {Error} When the value is neither a known preset nor a function.
 */
function resolveJsonParser(jsonParser) {
  const wantsDefaultParser = !jsonParser || jsonParser === 'lenient-json';
  if (wantsDefaultParser) {
    return normalJsonParser;
  }

  if (typeof jsonParser !== 'function') {
    throw new Error(`Unknown json parser preset: ${jsonParser}`);
  }

  return jsonParser;
}
|
|
35
|
+
/**
 * Builds the resolved chat-completion adapter from an optional definition.
 *
 * Missing callbacks fall back to `defaultChatCompletionParams` and
 * `defaultImageDetail`; a missing `unsupportedUserConfig` becomes `[]`.
 * Both returned callbacks receive the caller's input with `userConfig`
 * defaulted to `{}` and `midsceneDefaults` set to the module defaults.
 *
 * @param {object} [chatCompletion] - The `chatCompletion` part of an adapter definition.
 * @returns {{ unsupportedUserConfig: string[], buildChatCompletionParams: Function, resolveImageDetail: Function }}
 */
function resolveChatCompletion(chatCompletion) {
  const buildParams =
    chatCompletion?.buildChatCompletionParams ?? defaultChatCompletionParams;
  const detailResolver =
    chatCompletion?.resolveImageDetail ?? defaultImageDetail;
  const unsupportedUserConfig = chatCompletion?.unsupportedUserConfig ?? [];

  // Shared by both callbacks: explicit keys come after the spread so they
  // always override whatever `input` carried under the same names.
  const toCallContext = (input) => ({
    ...input,
    userConfig: input.userConfig ?? {},
    midsceneDefaults: midsceneChatCompletionDefaults,
  });

  return {
    unsupportedUserConfig,
    buildChatCompletionParams: (input) => buildParams(toCallContext(input)),
    resolveImageDetail: (input) => detailResolver(toCallContext(input)),
  };
}
|
|
56
|
+
/**
 * Produces the image-preprocess policy from an optional definition.
 *
 * @param {{ padBlockSize?: number }} [imagePreprocess]
 * @returns {{ padBlockSize: (number|undefined) }} `padBlockSize` is copied
 *   through, or `undefined` when no definition was given.
 */
function resolveImagePreprocess(imagePreprocess) {
  const padBlockSize = imagePreprocess?.padBlockSize;
  return { padBlockSize };
}
|
|
61
|
+
/**
 * Resolves the planning section of an adapter definition.
 *
 * Shared defaults: `cacheEnabled` → `true`, `defaultReplanningCycleLimit` →
 * the module-level default. `supportsActionDeepLocate` defaults to `false`
 * for `kind: 'custom'` definitions and to `true` otherwise. Only custom
 * definitions carry a `planFn`.
 *
 * @param {object} [planning] - The `planning` part of an adapter definition.
 * @returns {object} A planning adapter with `kind` `'custom'` or `'standard'`.
 */
function resolvePlanning(planning) {
  const shared = {
    cacheEnabled: planning?.cacheEnabled ?? true,
    defaultReplanningCycleLimit:
      planning?.defaultReplanningCycleLimit ?? defaultReplanningCycleLimit,
  };

  if (planning?.kind !== 'custom') {
    return {
      kind: 'standard',
      ...shared,
      supportsActionDeepLocate: planning?.supportsActionDeepLocate ?? true,
    };
  }

  return {
    kind: 'custom',
    ...shared,
    supportsActionDeepLocate: planning.supportsActionDeepLocate ?? false,
    planFn: planning.planFn,
  };
}
|
|
76
|
+
/**
 * Resolves the locate section of an adapter definition.
 *
 * Custom definitions keep their `locateFn` and default `supportsSearchArea`
 * to `false`. Everything else becomes a `'standard'` adapter whose
 * `supportsSearchArea` defaults to `true` and whose `resultAdapter` is built
 * by `createLocateResultAdapter` from the given definition or the module default.
 *
 * @param {object} [locate] - The `locate` part of an adapter definition.
 * @returns {object} A locate adapter with `kind` `'custom'` or `'standard'`.
 */
function resolveLocate(locate) {
  const isCustom = locate?.kind === 'custom';

  if (!isCustom) {
    const supportsSearchArea = locate?.supportsSearchArea ?? true;
    const definition =
      locate?.resultAdapter ?? defaultLocateResultAdapterDefinition;
    return {
      kind: 'standard',
      supportsSearchArea,
      resultAdapter: createLocateResultAdapter(definition),
    };
  }

  return {
    kind: 'custom',
    supportsSearchArea: locate.supportsSearchArea ?? false,
    locateFn: locate.locateFn,
  };
}
|
|
88
|
+
/**
 * Fully-resolved model adapter: every section of an adapter definition is run
 * through its resolver so consumers always see concrete values.
 */
class ResolvedModelAdapter {
  /**
   * @param {object} config - Adapter definition with optional `jsonParser`,
   *   `chatCompletion`, `imagePreprocess`, `planning` and `locate` sections.
   * @param {string} modelFamily - Accepted for the caller's benefit; this
   *   constructor does not read it.
   */
  constructor(config, modelFamily) {
    // Declare the fields first, in a fixed order, before any resolver runs.
    const fieldNames = [
      'jsonParser',
      'chatCompletion',
      'imagePreprocess',
      'planning',
      'locate',
    ];
    for (const fieldName of fieldNames) {
      _define_property(this, fieldName, void 0);
    }

    this.jsonParser = resolveJsonParser(config.jsonParser);
    this.chatCompletion = resolveChatCompletion(config.chatCompletion);
    this.imagePreprocess = resolveImagePreprocess(config.imagePreprocess);
    this.planning = resolvePlanning(config.planning);
    this.locate = resolveLocate(config.locate);
  }
}
|
|
102
|
+
export { ResolvedModelAdapter };
|
|
103
|
+
|
|
104
|
+
//# sourceMappingURL=resolved.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/resolved.mjs","sources":["../../../../src/ai-model/models/resolved.ts"],"sourcesContent":["import { normalJsonParser } from '../service-caller/json';\nimport { createLocateResultAdapter } from '../shared/model-locate-result/factory';\nimport type { LocateResultAdapterDefinition } from '../shared/model-locate-result/types';\nimport type {\n ChatCompletionAdapter,\n ChatCompletionCallContext,\n ChatCompletionCallInput,\n ImagePreprocessPolicy,\n JsonParser,\n LocateAdapter,\n MidsceneChatCompletionDefaults,\n ModelAdapter,\n ModelAdapterDefinition,\n PlanningAdapter,\n} from './types';\n\nconst defaultReplanningCycleLimit = 20;\n\nconst defaultImageDetail = (_input: unknown) => undefined;\n\nconst defaultChatCompletionParams = ({\n midsceneDefaults,\n userConfig,\n}: ChatCompletionCallContext) => ({\n config: {\n temperature: userConfig.temperature ?? midsceneDefaults.temperature,\n },\n});\n\nconst midsceneChatCompletionDefaults: MidsceneChatCompletionDefaults = {\n temperature: 0,\n};\n\nconst defaultLocateResultAdapterDefinition: LocateResultAdapterDefinition = {\n coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },\n};\n\nfunction resolveJsonParser(\n jsonParser: ModelAdapterDefinition['jsonParser'],\n): JsonParser {\n if (!jsonParser || jsonParser === 'lenient-json') {\n return normalJsonParser;\n }\n\n if (typeof jsonParser === 'function') {\n return jsonParser;\n }\n\n throw new Error(`Unknown json parser preset: ${jsonParser}`);\n}\n\nfunction resolveChatCompletion(\n chatCompletion: ModelAdapterDefinition['chatCompletion'],\n): ChatCompletionAdapter {\n const buildChatCompletionParams =\n chatCompletion?.buildChatCompletionParams ?? defaultChatCompletionParams;\n const resolveImageDetail =\n chatCompletion?.resolveImageDetail ?? defaultImageDetail;\n const unsupportedUserConfig = chatCompletion?.unsupportedUserConfig ?? 
[];\n\n return {\n unsupportedUserConfig,\n buildChatCompletionParams: (input) => {\n const context = {\n ...input,\n userConfig: input.userConfig ?? {},\n midsceneDefaults: midsceneChatCompletionDefaults,\n };\n return buildChatCompletionParams(context);\n },\n resolveImageDetail: (input) =>\n resolveImageDetail({\n ...input,\n userConfig: input.userConfig ?? {},\n midsceneDefaults: midsceneChatCompletionDefaults,\n }),\n };\n}\n\nfunction resolveImagePreprocess(\n imagePreprocess: ModelAdapterDefinition['imagePreprocess'],\n): ImagePreprocessPolicy {\n return {\n padBlockSize: imagePreprocess?.padBlockSize,\n };\n}\n\nfunction resolvePlanning(\n planning: ModelAdapterDefinition['planning'],\n): PlanningAdapter {\n if (planning?.kind === 'custom') {\n return {\n kind: 'custom',\n cacheEnabled: planning.cacheEnabled ?? true,\n defaultReplanningCycleLimit:\n planning.defaultReplanningCycleLimit ?? defaultReplanningCycleLimit,\n supportsActionDeepLocate: planning.supportsActionDeepLocate ?? false,\n planFn: planning.planFn,\n };\n }\n\n return {\n kind: 'standard',\n cacheEnabled: planning?.cacheEnabled ?? true,\n defaultReplanningCycleLimit:\n planning?.defaultReplanningCycleLimit ?? defaultReplanningCycleLimit,\n supportsActionDeepLocate: planning?.supportsActionDeepLocate ?? true,\n };\n}\n\nfunction resolveLocate(\n locate: ModelAdapterDefinition['locate'],\n): LocateAdapter {\n if (locate?.kind === 'custom') {\n return {\n kind: 'custom',\n supportsSearchArea: locate.supportsSearchArea ?? false,\n locateFn: locate.locateFn,\n };\n }\n\n return {\n kind: 'standard',\n supportsSearchArea: locate?.supportsSearchArea ?? true,\n resultAdapter: createLocateResultAdapter(\n locate?.resultAdapter ?? 
defaultLocateResultAdapterDefinition,\n ),\n };\n}\n\nexport class ResolvedModelAdapter implements ModelAdapter {\n readonly jsonParser: JsonParser;\n readonly chatCompletion: ChatCompletionAdapter;\n readonly imagePreprocess: ImagePreprocessPolicy;\n readonly planning: PlanningAdapter;\n readonly locate: LocateAdapter;\n\n constructor(config: ModelAdapterDefinition, modelFamily: string) {\n this.jsonParser = resolveJsonParser(config.jsonParser);\n this.chatCompletion = resolveChatCompletion(config.chatCompletion);\n this.imagePreprocess = resolveImagePreprocess(config.imagePreprocess);\n this.planning = resolvePlanning(config.planning);\n this.locate = resolveLocate(config.locate);\n }\n}\n"],"names":["defaultReplanningCycleLimit","defaultImageDetail","_input","undefined","defaultChatCompletionParams","midsceneDefaults","userConfig","midsceneChatCompletionDefaults","defaultLocateResultAdapterDefinition","resolveJsonParser","jsonParser","normalJsonParser","Error","resolveChatCompletion","chatCompletion","buildChatCompletionParams","resolveImageDetail","unsupportedUserConfig","input","context","resolveImagePreprocess","imagePreprocess","resolvePlanning","planning","resolveLocate","locate","createLocateResultAdapter","ResolvedModelAdapter","config","modelFamily"],"mappings":";;;;;;;;;;;;AAgBA,MAAMA,8BAA8B;AAEpC,MAAMC,qBAAqB,CAACC,SAAoBC;AAEhD,MAAMC,8BAA8B,CAAC,EACnCC,gBAAgB,EAChBC,UAAU,EACgB,GAAM;QAChC,QAAQ;YACN,aAAaA,WAAW,WAAW,IAAID,iBAAiB,WAAW;QACrE;IACF;AAEA,MAAME,iCAAiE;IACrE,aAAa;AACf;AAEA,MAAMC,uCAAsE;IAC1E,aAAa;QAAE,OAAO;QAAQ,OAAO;QAAM,cAAc;IAAK;AAChE;AAEA,SAASC,kBACPC,UAAgD;IAEhD,IAAI,CAACA,cAAcA,AAAe,mBAAfA,YACjB,OAAOC;IAGT,IAAI,AAAsB,cAAtB,OAAOD,YACT,OAAOA;IAGT,MAAM,IAAIE,MAAM,CAAC,4BAA4B,EAAEF,YAAY;AAC7D;AAEA,SAASG,sBACPC,cAAwD;IAExD,MAAMC,4BACJD,gBAAgB,6BAA6BV;IAC/C,MAAMY,qBACJF,gBAAgB,sBAAsBb;IACxC,MAAMgB,wBAAwBH,gBAAgB,yBAAyB,EAAE;IAEzE,OAAO;QACLG;QACA,2BAA2B,CAACC;YAC1B,MAAMC,UAAU;gBACd,GAAGD,KAAK;gBACR,YAAYA,MAAM,UAAU,IAAI,CAAC;gBACjC,kBAAkBX;YACpB;YAC
A,OAAOQ,0BAA0BI;QACnC;QACA,oBAAoB,CAACD,QACnBF,mBAAmB;gBACjB,GAAGE,KAAK;gBACR,YAAYA,MAAM,UAAU,IAAI,CAAC;gBACjC,kBAAkBX;YACpB;IACJ;AACF;AAEA,SAASa,uBACPC,eAA0D;IAE1D,OAAO;QACL,cAAcA,iBAAiB;IACjC;AACF;AAEA,SAASC,gBACPC,QAA4C;IAE5C,IAAIA,UAAU,SAAS,UACrB,OAAO;QACL,MAAM;QACN,cAAcA,SAAS,YAAY,IAAI;QACvC,6BACEA,SAAS,2BAA2B,IAAIvB;QAC1C,0BAA0BuB,SAAS,wBAAwB,IAAI;QAC/D,QAAQA,SAAS,MAAM;IACzB;IAGF,OAAO;QACL,MAAM;QACN,cAAcA,UAAU,gBAAgB;QACxC,6BACEA,UAAU,+BAA+BvB;QAC3C,0BAA0BuB,UAAU,4BAA4B;IAClE;AACF;AAEA,SAASC,cACPC,MAAwC;IAExC,IAAIA,QAAQ,SAAS,UACnB,OAAO;QACL,MAAM;QACN,oBAAoBA,OAAO,kBAAkB,IAAI;QACjD,UAAUA,OAAO,QAAQ;IAC3B;IAGF,OAAO;QACL,MAAM;QACN,oBAAoBA,QAAQ,sBAAsB;QAClD,eAAeC,0BACbD,QAAQ,iBAAiBjB;IAE7B;AACF;AAEO,MAAMmB;IAOX,YAAYC,MAA8B,EAAEC,WAAmB,CAAE;QANjE,uBAAS,cAAT;QACA,uBAAS,kBAAT;QACA,uBAAS,mBAAT;QACA,uBAAS,YAAT;QACA,uBAAS,UAAT;QAGE,IAAI,CAAC,UAAU,GAAGpB,kBAAkBmB,OAAO,UAAU;QACrD,IAAI,CAAC,cAAc,GAAGf,sBAAsBe,OAAO,cAAc;QACjE,IAAI,CAAC,eAAe,GAAGR,uBAAuBQ,OAAO,eAAe;QACpE,IAAI,CAAC,QAAQ,GAAGN,gBAAgBM,OAAO,QAAQ;QAC/C,IAAI,CAAC,MAAM,GAAGJ,cAAcI,OAAO,MAAM;IAC3C;AACF"}
|
|
File without changes
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { UITarsModelVersion } from "@midscene/shared/env";
|
|
2
|
+
import { assert } from "@midscene/shared/utils";
|
|
3
|
+
import { jsonrepair } from "jsonrepair";
|
|
4
|
+
import { extractJSONFromCodeBlock, safeParseJson } from "../../service-caller/json.mjs";
|
|
5
|
+
import { unwrapCoordinateListLikeInput } from "../../shared/model-locate-result/index.mjs";
|
|
6
|
+
import { uiTarsPlanning } from "./planning.mjs";
|
|
7
|
+
// Replanning-cycle limit that createUiTarsAdapter puts into every UI-TARS planning definition.
const defaultVlmUiTarsReplanningCycleLimit = 40;
|
|
8
|
+
/**
 * Recursively trims a parsed JSON value.
 *
 * - `null` / `undefined` are returned unchanged.
 * - Strings are trimmed.
 * - Arrays are mapped element by element.
 * - Plain objects get trimmed keys and normalised values; a string value whose
 *   (trimmed) key is listed in `context.preserveStringValueKeys` is kept verbatim.
 * - Any other primitive is returned unchanged.
 *
 * The result object is built with `Object.fromEntries`, so every key —
 * including `"__proto__"`, which `JSON.parse` yields as an ordinary own
 * property — stays an own data property instead of replacing the result's
 * prototype.
 *
 * @param {*} obj - Value to normalise (typically `JSON.parse` output).
 * @param {{ preserveStringValueKeys?: string[] }} [context={}]
 * @returns {*} A normalised copy; the input is not mutated.
 */
function normalizeJsonObject(obj, context = {}) {
  if (obj == null) {
    return obj;
  }

  if (typeof obj === 'string') {
    return obj.trim();
  }

  if (Array.isArray(obj)) {
    return obj.map((item) => normalizeJsonObject(item, context));
  }

  if (typeof obj !== 'object') {
    return obj;
  }

  const normalizedEntries = Object.entries(obj).map(([key, value]) => {
    const trimmedKey = key.trim();
    const preserveStringValue =
      context.preserveStringValueKeys?.includes(trimmedKey) ?? false;
    const keepVerbatim = typeof value === 'string' && preserveStringValue;
    return [
      trimmedKey,
      keepVerbatim ? value : normalizeJsonObject(value, context),
    ];
  });

  // fromEntries defines own properties; a plain `target[key] = value` would
  // route a "__proto__" key through the prototype setter.
  return Object.fromEntries(normalizedEntries);
}
|
|
23
|
+
/**
 * Tells whether a failed parse from the given source should go through the
 * UI-TARS repair path.
 *
 * @param {string} source - Parser source tag from the JSON parser context.
 * @returns {boolean} `true` for `'locate'`, `'section-locator'` and
 *   `'planning-action-param'`; `false` for anything else.
 */
function shouldRepairUiTarsLocateJson(source) {
  switch (source) {
    case 'locate':
    case 'section-locator':
    case 'planning-action-param':
      return true;
    default:
      return false;
  }
}
|
|
26
|
+
/**
 * Prepares a UI-TARS locate response for JSON repair.
 *
 * When the text mentions `bbox`, every run of whitespace that sits directly
 * between two digits is replaced by a comma, anywhere in the text. Text
 * without `bbox` is returned untouched.
 *
 * @param {string} input - Raw JSON-ish text.
 * @returns {string} The text with digit-separating whitespace turned into commas.
 */
function preprocessUiTarsLocateJson(input) {
  if (!input.includes('bbox')) {
    return input;
  }

  // The lookahead leaves the following digit unconsumed, so chains such as
  // "1 2 3 4" are fully rewritten in a single pass.
  return input.replace(/(\d)\s+(?=\d)/g, '$1,');
}
|
|
30
|
+
/**
 * JSON parser for UI-TARS responses.
 *
 * First tries `safeParseJson`. If that throws and the source is one of the
 * locate-style sources, the raw text is extracted from its code block,
 * pre-processed, repaired with `jsonrepair`, parsed and normalised.
 *
 * @param {string} raw - Raw model response.
 * @param {{ source: string, preserveStringValueKeys?: string[] }} [context]
 *   Parser context; defaults to `{ source: 'generic-object' }`.
 * @returns {*} The parsed value.
 * @throws The original `safeParseJson` error for sources that are not repaired.
 * @throws {Error} When the repair path fails too; the underlying error is
 *   attached as `cause` so its stack is not lost.
 */
const uiTarsJsonParser = (raw, context = { source: 'generic-object' }) => {
  const { source } = context;
  try {
    return safeParseJson(raw, context);
  } catch (firstError) {
    if (!shouldRepairUiTarsLocateJson(source)) {
      throw firstError;
    }

    const jsonString = preprocessUiTarsLocateJson(extractJSONFromCodeBlock(raw));
    try {
      return normalizeJsonObject(JSON.parse(jsonrepair(jsonString)), context);
    } catch (error) {
      // Same message as before; `cause` keeps the real failure inspectable.
      const underlying = error ?? firstError;
      throw new Error(
        `failed to parse LLM response into JSON. Error - ${String(underlying ?? 'unknown error')}. Response - \n ${raw}`,
        { cause: underlying },
      );
    }
  }
};
|
|
46
|
+
/**
 * Converts a raw UI-TARS locate value into a `{ type, coordinates }` result.
 *
 * Accepted shapes (after `unwrapCoordinateListLikeInput`):
 * - a string of four integers separated by single whitespace characters
 *   (surrounding whitespace is ignored) → bbox;
 * - an array of numbers, or of strings where each item is a number, an
 *   `"x,y"` pair or an `"x y"` pair; string items are flattened to numbers.
 *
 * The flattened value count picks the result:
 * - 4 or 5 → bbox from the first four values;
 * - 2, 3, 6 or 7 → point from the first two values;
 * - 8 → bbox from values 0, 1, 4 and 5 (presumably opposite corners of a
 *   four-corner polygon — TODO confirm against model output).
 *
 * @param {unknown} input - Raw locate value from the model response.
 * @returns {{ type: 'bbox' | 'point', coordinates: number[] }}
 * @throws {Error} When the value matches none of the accepted shapes.
 */
function parseUiTarsRawLocateValue(input) {
  const bbox = unwrapCoordinateListLikeInput(input);

  if (typeof bbox === 'string') {
    // Read the numbers from the same match that validates the string, so any
    // input that passes validation (padding, tabs) also parses.
    const match = /^(\d+)\s(\d+)\s(\d+)\s(\d+)$/.exec(bbox.trim());
    assert(match, `invalid bbox data string for ui-tars mode: ${bbox}`);
    return {
      type: 'bbox',
      coordinates: match.slice(1, 5).map((part) => Number(part)),
    };
  }

  let bboxList = bbox;
  if (Array.isArray(bbox) && typeof bbox[0] === 'string') {
    bboxList = bbox.flatMap((item) => {
      if (typeof item === 'string' && item.includes(',')) {
        const [x, y] = item.split(',');
        return [Number(x.trim()), Number(y.trim())];
      }
      if (typeof item === 'string' && item.includes(' ')) {
        const [x, y] = item.split(' ');
        return [Number(x.trim()), Number(y.trim())];
      }
      return [Number(item)];
    });
  }

  // Every branch keys off the flattened list; `?.` lets a null/undefined
  // value fall through to the descriptive error below.
  switch (bboxList?.length) {
    case 4:
    case 5:
      return {
        type: 'bbox',
        coordinates: [bboxList[0], bboxList[1], bboxList[2], bboxList[3]],
      };
    case 2:
    case 3:
    case 6:
    case 7:
      return {
        type: 'point',
        coordinates: [bboxList[0], bboxList[1]],
      };
    case 8:
      return {
        type: 'bbox',
        coordinates: [bboxList[0], bboxList[1], bboxList[4], bboxList[5]],
      };
    default: {
      const msg = `invalid bbox data for ui-tars mode: ${JSON.stringify(bbox)} `;
      throw new Error(msg);
    }
  }
}
|
|
101
|
+
/**
 * Builds the adapter definition for one UI-TARS model version.
 *
 * The definition uses the UI-TARS JSON parser, lists the reasoning options as
 * unsupported user config, plans through `uiTarsPlanning` (custom planning,
 * caching off) and parses locate results with `parseUiTarsRawLocateValue`.
 *
 * @param {*} uiTarsModelVersion - Version value forwarded as the third
 *   argument of `uiTarsPlanning`.
 * @returns {object} Adapter definition with `jsonParser`, `chatCompletion`,
 *   `planning` and `locate` sections.
 */
function createUiTarsAdapter(uiTarsModelVersion) {
  const buildChatCompletionParams = (callContext) => {
    const { midsceneDefaults, userConfig } = callContext;
    const temperature = userConfig.temperature ?? midsceneDefaults.temperature;
    return { config: { temperature } };
  };

  // Closes over the model version so callers only pass instruction + options.
  const planFn = (userInstruction, options) =>
    uiTarsPlanning(userInstruction, options, uiTarsModelVersion);

  const chatCompletion = {
    unsupportedUserConfig: [
      'reasoningEnabled',
      'reasoningEffort',
      'reasoningBudget',
    ],
    buildChatCompletionParams,
  };

  const planning = {
    kind: 'custom',
    cacheEnabled: false,
    defaultReplanningCycleLimit: defaultVlmUiTarsReplanningCycleLimit,
    planFn,
  };

  const locate = {
    resultAdapter: {
      coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },
      parseRawLocateValue: parseUiTarsRawLocateValue,
    },
  };

  return {
    jsonParser: uiTarsJsonParser,
    chatCompletion,
    planning,
    locate,
  };
}
|
|
134
|
+
// One definition instance shared by both Doubao-flavoured family names below.
const uiTarsDoubao15Adapter = createUiTarsAdapter(
  UITarsModelVersion.DOUBAO_1_5_20B,
);

// Definition for the plain 'vlm-ui-tars' family.
const uiTarsV1Adapter = createUiTarsAdapter(UITarsModelVersion.V1_0);

// Adapter definitions keyed by UI-TARS model-family name.
const uiTarsAdapters = {
  'vlm-ui-tars': uiTarsV1Adapter,
  'vlm-ui-tars-doubao': uiTarsDoubao15Adapter,
  'vlm-ui-tars-doubao-1.5': uiTarsDoubao15Adapter,
};
|
|
140
|
+
export { uiTarsAdapters };
|
|
141
|
+
|
|
142
|
+
//# sourceMappingURL=adapter.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/ui-tars/adapter.mjs","sources":["../../../../../src/ai-model/models/ui-tars/adapter.ts"],"sourcesContent":["import { type TModelFamily, UITarsModelVersion } from '@midscene/shared/env';\nimport { assert } from '@midscene/shared/utils';\nimport { jsonrepair } from 'jsonrepair';\nimport {\n extractJSONFromCodeBlock,\n safeParseJson,\n} from '../../service-caller/json';\nimport {\n type LocateResultValue,\n unwrapCoordinateListLikeInput,\n} from '../../shared/model-locate-result';\nimport type {\n JsonParserContext,\n JsonParserSource,\n ModelAdapterDefinition,\n} from '../types';\nimport { uiTarsPlanning } from './planning';\n\nconst defaultVlmUiTarsReplanningCycleLimit = 40;\n\nfunction normalizeJsonObject(\n obj: any,\n context: Pick<JsonParserContext, 'preserveStringValueKeys'> = {},\n): any {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => normalizeJsonObject(item, context));\n }\n\n if (typeof obj === 'object') {\n const normalized: any = {};\n for (const [key, value] of Object.entries(obj)) {\n const trimmedKey = key.trim();\n const preserveStringValue =\n context.preserveStringValueKeys?.includes(trimmedKey) ?? false;\n const normalizedValue =\n typeof value === 'string'\n ? preserveStringValue\n ? value\n : value.trim()\n : normalizeJsonObject(value, context);\n normalized[trimmedKey] = normalizedValue;\n }\n return normalized;\n }\n\n return typeof obj === 'string' ? 
obj.trim() : obj;\n}\n\nfunction shouldRepairUiTarsLocateJson(source: JsonParserSource) {\n return (\n source === 'locate' ||\n source === 'section-locator' ||\n source === 'planning-action-param'\n );\n}\n\nfunction preprocessUiTarsLocateJson(input: string) {\n if (input.includes('bbox')) {\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nconst uiTarsJsonParser: ModelAdapterDefinition['jsonParser'] = (\n raw,\n context = { source: 'generic-object' },\n) => {\n const { source } = context;\n try {\n return safeParseJson(raw, context);\n } catch (firstError) {\n if (!shouldRepairUiTarsLocateJson(source)) {\n throw firstError;\n }\n\n const jsonString = preprocessUiTarsLocateJson(\n extractJSONFromCodeBlock(raw),\n );\n try {\n return normalizeJsonObject(JSON.parse(jsonrepair(jsonString)), context);\n } catch (error) {\n throw Error(\n `failed to parse LLM response into JSON. Error - ${String(\n error ?? firstError ?? 'unknown error',\n )}. Response - \\n ${raw}`,\n );\n }\n }\n};\n\n// UI-TARS has not received active updates for a long time, so this parser is\n// intentionally kept separate from Doubao even though the current logic is the\n// same. 
This avoids coupling UI-TARS behavior to future Doubao adapter changes.\nfunction parseUiTarsRawLocateValue(input: unknown): LocateResultValue {\n const bbox = unwrapCoordinateListLikeInput(input as any);\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for ui-tars mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return {\n type: 'bbox',\n coordinates: [\n Number(splitted[0]),\n Number(splitted[1]),\n Number(splitted[2]),\n Number(splitted[3]),\n ],\n };\n }\n throw new Error(`invalid bbox data string for ui-tars mode: ${bbox}`);\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as number[];\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return {\n type: 'bbox',\n coordinates: [bboxList[0], bboxList[1], bboxList[2], bboxList[3]],\n };\n }\n\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return { type: 'point', coordinates: [bboxList[0], bboxList[1]] };\n }\n\n if (bbox.length === 8) {\n return {\n type: 'bbox',\n coordinates: [bboxList[0], bboxList[1], bboxList[4], bboxList[5]],\n };\n }\n\n const msg = `invalid bbox data for ui-tars mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nfunction createUiTarsAdapter(\n uiTarsModelVersion: UITarsModelVersion,\n): ModelAdapterDefinition {\n return {\n jsonParser: uiTarsJsonParser,\n chatCompletion: {\n unsupportedUserConfig: [\n 'reasoningEnabled',\n 
'reasoningEffort',\n 'reasoningBudget',\n ],\n buildChatCompletionParams: ({ midsceneDefaults, userConfig }) => ({\n config: {\n temperature: userConfig.temperature ?? midsceneDefaults.temperature,\n },\n }),\n },\n planning: {\n kind: 'custom',\n cacheEnabled: false,\n defaultReplanningCycleLimit: defaultVlmUiTarsReplanningCycleLimit,\n planFn: (userInstruction, options) =>\n uiTarsPlanning(userInstruction, options, uiTarsModelVersion),\n },\n locate: {\n resultAdapter: {\n coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },\n parseRawLocateValue: parseUiTarsRawLocateValue,\n },\n },\n };\n}\n\nconst uiTarsDoubao15Adapter = createUiTarsAdapter(\n UITarsModelVersion.DOUBAO_1_5_20B,\n);\n\nexport const uiTarsAdapters = {\n 'vlm-ui-tars': createUiTarsAdapter(UITarsModelVersion.V1_0),\n 'vlm-ui-tars-doubao': uiTarsDoubao15Adapter,\n 'vlm-ui-tars-doubao-1.5': uiTarsDoubao15Adapter,\n} satisfies Pick<\n Record<TModelFamily, ModelAdapterDefinition>,\n 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 
'vlm-ui-tars-doubao-1.5'\n>;\n"],"names":["defaultVlmUiTarsReplanningCycleLimit","normalizeJsonObject","obj","context","Array","item","normalized","key","value","Object","trimmedKey","preserveStringValue","normalizedValue","shouldRepairUiTarsLocateJson","source","preprocessUiTarsLocateJson","input","uiTarsJsonParser","raw","safeParseJson","firstError","jsonString","extractJSONFromCodeBlock","JSON","jsonrepair","error","Error","String","parseUiTarsRawLocateValue","bbox","unwrapCoordinateListLikeInput","assert","splitted","Number","bboxList","x","y","msg","createUiTarsAdapter","uiTarsModelVersion","midsceneDefaults","userConfig","userInstruction","options","uiTarsPlanning","uiTarsDoubao15Adapter","UITarsModelVersion","uiTarsAdapters"],"mappings":";;;;;;AAkBA,MAAMA,uCAAuC;AAE7C,SAASC,oBACPC,GAAQ,EACRC,UAA8D,CAAC,CAAC;IAEhE,IAAID,QAAAA,KACF,OAAOA;IAGT,IAAIE,MAAM,OAAO,CAACF,MAChB,OAAOA,IAAI,GAAG,CAAC,CAACG,OAASJ,oBAAoBI,MAAMF;IAGrD,IAAI,AAAe,YAAf,OAAOD,KAAkB;QAC3B,MAAMI,aAAkB,CAAC;QACzB,KAAK,MAAM,CAACC,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACP,KAAM;YAC9C,MAAMQ,aAAaH,IAAI,IAAI;YAC3B,MAAMI,sBACJR,QAAQ,uBAAuB,EAAE,SAASO,eAAe;YAC3D,MAAME,kBACJ,AAAiB,YAAjB,OAAOJ,QACHG,sBACEH,QACAA,MAAM,IAAI,KACZP,oBAAoBO,OAAOL;YACjCG,UAAU,CAACI,WAAW,GAAGE;QAC3B;QACA,OAAON;IACT;IAEA,OAAO,AAAe,YAAf,OAAOJ,MAAmBA,IAAI,IAAI,KAAKA;AAChD;AAEA,SAASW,6BAA6BC,MAAwB;IAC5D,OACEA,AAAW,aAAXA,UACAA,AAAW,sBAAXA,UACAA,AAAW,4BAAXA;AAEJ;AAEA,SAASC,2BAA2BC,KAAa;IAC/C,IAAIA,MAAM,QAAQ,CAAC,SACjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEA,MAAMC,mBAAyD,CAC7DC,KACAf,UAAU;IAAE,QAAQ;AAAiB,CAAC;IAEtC,MAAM,EAAEW,MAAM,EAAE,GAAGX;IACnB,IAAI;QACF,OAAOgB,cAAcD,KAAKf;IAC5B,EAAE,OAAOiB,YAAY;QACnB,IAAI,CAACP,6BAA6BC,SAChC,MAAMM;QAGR,MAAMC,aAAaN,2BACjBO,yBAAyBJ;QAE3B,IAAI;YACF,OAAOjB,oBAAoBsB,KAAK,KAAK,CAACC,WAAWH,cAAclB;QACjE,EAAE,OAAOsB,OAAO;YACd,MAAMC,MACJ,CAAC,gDAAgD,EAAEC,OACjDF,SAASL,cAAc,iBACvB,gBAAgB,EAAEF,KAAK;QAE7B;IACF;AACF;AAKA,SAASU,0BAA0BZ,KAAc;IAC/C,MAAMa,OAAOC,8BAA8Bd;IAC3C,IAAI,AAAgB,YA
AhB,OAAOa,MAAmB;QAC5BE,OACE,+BAA+B,IAAI,CAACF,KAAK,IAAI,KAC7C,CAAC,2CAA2C,EAAEA,MAAM;QAEtD,MAAMG,WAAWH,KAAK,KAAK,CAAC;QAC5B,IAAIG,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,MAAM;YACN,aAAa;gBACXC,OAAOD,QAAQ,CAAC,EAAE;gBAClBC,OAAOD,QAAQ,CAAC,EAAE;gBAClBC,OAAOD,QAAQ,CAAC,EAAE;gBAClBC,OAAOD,QAAQ,CAAC,EAAE;aACnB;QACH;QAEF,MAAM,IAAIN,MAAM,CAAC,2CAA2C,EAAEG,MAAM;IACtE;IAEA,IAAIK,WAAqB,EAAE;IAC3B,IAAI9B,MAAM,OAAO,CAACyB,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACxB;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAAC8B,GAAGC,EAAE,GAAG/B,KAAK,KAAK,CAAC;YAC1B6B,SAAS,IAAI,CAACD,OAAOE,EAAE,IAAI,KAAKF,OAAOG,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAO/B,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAAC8B,GAAGC,EAAE,GAAG/B,KAAK,KAAK,CAAC;YAC1B6B,SAAS,IAAI,CAACD,OAAOE,EAAE,IAAI,KAAKF,OAAOG,EAAE,IAAI;QAC/C,OACEF,SAAS,IAAI,CAACD,OAAO5B;IAEzB;SAEA6B,WAAWL;IAGb,IAAIK,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACL,MAAM;QACN,aAAa;YAACA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;SAAC;IACnE;IAGF,IACEA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QAAE,MAAM;QAAS,aAAa;YAACA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;SAAC;IAAC;IAGlE,IAAIL,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACL,MAAM;QACN,aAAa;YAACK,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;YAAEA,QAAQ,CAAC,EAAE;SAAC;IACnE;IAGF,MAAMG,MAAM,CAAC,oCAAoC,EAAEd,KAAK,SAAS,CAACM,MAAM,CAAC,CAAC;IAC1E,MAAM,IAAIH,MAAMW;AAClB;AAEA,SAASC,oBACPC,kBAAsC;IAEtC,OAAO;QACL,YAAYtB;QACZ,gBAAgB;YACd,uBAAuB;gBACrB;gBACA;gBACA;aACD;YACD,2BAA2B,CAAC,EAAEuB,gBAAgB,EAAEC,UAAU,EAAE,GAAM;oBAChE,QAAQ;wBACN,aAAaA,WAAW,WAAW,IAAID,iBAAiB,WAAW;oBACrE;gBACF;QACF;QACA,UAAU;YACR,MAAM;YACN,cAAc;YACd,6BAA6BxC;YAC7B,QAAQ,CAAC0C,iBAAiBC,UACxBC,eAAeF,iBAAiBC,SAASJ;QAC7C;QACA,QAAQ;YACN,eAAe;gBACb,aAAa;oBAAE,OAAO;oBAAQ,OAAO;oBAAM,cAAc;gBAAK;gBAC9D,qBAAqBX;YACvB;QACF;IACF;AACF;AAEA,MAAMiB,wBAAwBP,oBAC5BQ,mBAAmB,cAAc;AAG5B,MAAMC
,iBAAiB;IAC5B,eAAeT,oBAAoBQ,mBAAmB,IAAI;IAC1D,sBAAsBD;IACtB,0BAA0BA;AAC5B"}
|
|
@@ -2,22 +2,33 @@ import { getDebug } from "@midscene/shared/logger";
|
|
|
2
2
|
import { transformHotkeyInput } from "@midscene/shared/us-keyboard-layout";
|
|
3
3
|
import { assert } from "@midscene/shared/utils";
|
|
4
4
|
import { actionParser } from "@ui-tars/action-parser";
|
|
5
|
-
import { getSummary, getUiTarsPlanningPrompt } from "
|
|
6
|
-
import { AIResponseParseError, callAIWithStringResponse } from "
|
|
5
|
+
import { getSummary, getUiTarsPlanningPrompt } from "../../prompt/ui-tars-planning.mjs";
|
|
6
|
+
import { AIResponseParseError, callAIWithStringResponse } from "../../service-caller/index.mjs";
|
|
7
|
+
import { finalizePixelBbox } from "../../shared/model-locate-result/bbox.mjs";
|
|
8
|
+
import { mapLocateResultToPixelBboxByCoordinates } from "../../shared/model-locate-result/pixel-bbox-mapper.mjs";
|
|
7
9
|
const debug = getDebug('ui-tars-planning');
|
|
8
10
|
const warnLog = getDebug('ui-tars-planning', {
|
|
9
11
|
console: true
|
|
10
12
|
});
|
|
11
|
-
|
|
12
|
-
const
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
13
|
+
/**
 * Builds a locate parameter from a point returned by UI-TARS planning.
 *
 * The point is wrapped as a `'point'` locate result and mapped to a pixel
 * bbox with a `{ shape: 'point', order: 'xy', normalizedBy: 1 }` coordinate
 * spec, then passed through `finalizePixelBbox`. Both helpers receive the same
 * context object carrying `size` as `preparedSize`.
 *
 * @param {number[]} point - Point coordinates as produced by `getPoint`.
 * @param {string} [thought] - Model thought; used as the prompt, `''` when falsy.
 * @param {*} size - Screenshot size forwarded as `preparedSize`
 *   (presumably `{ width, height }` — TODO confirm against the mapper).
 * @returns {{ prompt: string, locatedPixelBbox: * }}
 */
function pointToLocateParam(point, thought, size) {
  const ctx = { preparedSize: size };
  const locateResult = { type: 'point', coordinates: point };
  const coordinateSpec = { shape: 'point', order: 'xy', normalizedBy: 1 };

  const pixelBbox = mapLocateResultToPixelBboxByCoordinates(
    locateResult,
    ctx,
    coordinateSpec,
  );
  const prompt = thought || '';
  const locatedPixelBbox = finalizePixelBbox(pixelBbox, point, ctx);

  return { prompt, locatedPixelBbox };
}
|
|
30
|
+
async function uiTarsPlanning(userInstruction, options, uiTarsModelVersion) {
|
|
31
|
+
const { conversationHistory, context, modelRuntime, actionContext } = options;
|
|
21
32
|
let instruction = userInstruction;
|
|
22
33
|
if (actionContext) instruction = `<high_priority_knowledge>${actionContext}</high_priority_knowledge>\n<user_instruction>${userInstruction}</user_instruction>`;
|
|
23
34
|
const systemPrompt = getUiTarsPlanningPrompt() + instruction;
|
|
@@ -39,7 +50,7 @@ async function uiTarsPlanning(userInstruction, options) {
|
|
|
39
50
|
content: systemPrompt
|
|
40
51
|
},
|
|
41
52
|
...conversationHistory.snapshot()
|
|
42
|
-
],
|
|
53
|
+
], modelRuntime, {
|
|
43
54
|
abortSignal: options.abortSignal
|
|
44
55
|
});
|
|
45
56
|
let convertedText;
|
|
@@ -73,76 +84,46 @@ async function uiTarsPlanning(userInstruction, options) {
|
|
|
73
84
|
const actionType = (action.action_type || '').toLowerCase();
|
|
74
85
|
if ('click' === actionType) {
|
|
75
86
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
76
|
-
const point = getPoint(action.action_inputs.start_box
|
|
77
|
-
const locate =
|
|
78
|
-
prompt: action.thought || '',
|
|
79
|
-
bbox: pointToBbox({
|
|
80
|
-
x: point[0],
|
|
81
|
-
y: point[1]
|
|
82
|
-
}, shotSize.width, shotSize.height)
|
|
83
|
-
};
|
|
87
|
+
const point = getPoint(action.action_inputs.start_box);
|
|
88
|
+
const locate = pointToLocateParam(point, action.thought, shotSize);
|
|
84
89
|
transformActions.push({
|
|
85
90
|
type: 'Tap',
|
|
86
91
|
param: {
|
|
87
|
-
locate
|
|
92
|
+
locate
|
|
88
93
|
}
|
|
89
94
|
});
|
|
90
95
|
} else if ('left_double' === actionType) {
|
|
91
96
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
92
|
-
const point = getPoint(action.action_inputs.start_box
|
|
93
|
-
const locate =
|
|
94
|
-
prompt: action.thought || '',
|
|
95
|
-
bbox: pointToBbox({
|
|
96
|
-
x: point[0],
|
|
97
|
-
y: point[1]
|
|
98
|
-
}, shotSize.width, shotSize.height)
|
|
99
|
-
};
|
|
97
|
+
const point = getPoint(action.action_inputs.start_box);
|
|
98
|
+
const locate = pointToLocateParam(point, action.thought, shotSize);
|
|
100
99
|
transformActions.push({
|
|
101
100
|
type: 'DoubleClick',
|
|
102
101
|
param: {
|
|
103
|
-
locate
|
|
102
|
+
locate
|
|
104
103
|
},
|
|
105
104
|
thought: action.thought || ''
|
|
106
105
|
});
|
|
107
106
|
} else if ('right_single' === actionType) {
|
|
108
107
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
109
|
-
const point = getPoint(action.action_inputs.start_box
|
|
110
|
-
const locate =
|
|
111
|
-
prompt: action.thought || '',
|
|
112
|
-
bbox: pointToBbox({
|
|
113
|
-
x: point[0],
|
|
114
|
-
y: point[1]
|
|
115
|
-
}, shotSize.width, shotSize.height)
|
|
116
|
-
};
|
|
108
|
+
const point = getPoint(action.action_inputs.start_box);
|
|
109
|
+
const locate = pointToLocateParam(point, action.thought, shotSize);
|
|
117
110
|
transformActions.push({
|
|
118
111
|
type: 'RightClick',
|
|
119
112
|
param: {
|
|
120
|
-
locate
|
|
113
|
+
locate
|
|
121
114
|
},
|
|
122
115
|
thought: action.thought || ''
|
|
123
116
|
});
|
|
124
117
|
} else if ('drag' === actionType) {
|
|
125
118
|
assert(action.action_inputs.start_box, 'start_box is required');
|
|
126
119
|
assert(action.action_inputs.end_box, 'end_box is required');
|
|
127
|
-
const startPoint = getPoint(action.action_inputs.start_box
|
|
128
|
-
const endPoint = getPoint(action.action_inputs.end_box
|
|
120
|
+
const startPoint = getPoint(action.action_inputs.start_box);
|
|
121
|
+
const endPoint = getPoint(action.action_inputs.end_box);
|
|
129
122
|
transformActions.push({
|
|
130
123
|
type: 'DragAndDrop',
|
|
131
124
|
param: {
|
|
132
|
-
from:
|
|
133
|
-
|
|
134
|
-
bbox: pointToBbox({
|
|
135
|
-
x: startPoint[0],
|
|
136
|
-
y: startPoint[1]
|
|
137
|
-
}, shotSize.width, shotSize.height)
|
|
138
|
-
},
|
|
139
|
-
to: {
|
|
140
|
-
prompt: action.thought || '',
|
|
141
|
-
bbox: pointToBbox({
|
|
142
|
-
x: endPoint[0],
|
|
143
|
-
y: endPoint[1]
|
|
144
|
-
}, shotSize.width, shotSize.height)
|
|
145
|
-
}
|
|
125
|
+
from: pointToLocateParam(startPoint, action.thought, shotSize),
|
|
126
|
+
to: pointToLocateParam(endPoint, action.thought, shotSize)
|
|
146
127
|
},
|
|
147
128
|
thought: action.thought || ''
|
|
148
129
|
});
|
|
@@ -165,7 +146,7 @@ async function uiTarsPlanning(userInstruction, options) {
|
|
|
165
146
|
transformActions.push({
|
|
166
147
|
type: 'Finished',
|
|
167
148
|
param: {},
|
|
168
|
-
thought: action.thought || ''
|
|
149
|
+
thought: action.action_inputs.content || action.thought || ''
|
|
169
150
|
});
|
|
170
151
|
} else if ('hotkey' === actionType) if (action.action_inputs.key) {
|
|
171
152
|
const keys = transformHotkeyInput(action.action_inputs.key);
|
|
@@ -234,16 +215,17 @@ function convertBboxToCoordinates(text) {
|
|
|
234
215
|
const y = Math.floor((y1Num + y2Num) / 2);
|
|
235
216
|
return `(${x},${y})`;
|
|
236
217
|
}
|
|
237
|
-
const cleanedText = text.replace(/\[EOS\]/g, '');
|
|
218
|
+
const cleanedText = text.replace(/\[EOS\]/g, '').replace(/```(?:[a-zA-Z0-9_-]+)?/g, '');
|
|
238
219
|
return cleanedText.replace(pattern, replaceMatch).trim();
|
|
239
220
|
}
|
|
240
|
-
function getPoint(startBox
|
|
221
|
+
function getPoint(startBox) {
|
|
241
222
|
const [x, y] = JSON.parse(startBox);
|
|
223
|
+
assert('number' == typeof x && Number.isFinite(x) && 'number' == typeof y && Number.isFinite(y), `invalid point data for ui-tars planning: ${startBox}`);
|
|
242
224
|
return [
|
|
243
|
-
x
|
|
244
|
-
y
|
|
225
|
+
x,
|
|
226
|
+
y
|
|
245
227
|
];
|
|
246
228
|
}
|
|
247
229
|
export { uiTarsPlanning };
|
|
248
230
|
|
|
249
|
-
//# sourceMappingURL=
|
|
231
|
+
//# sourceMappingURL=planning.mjs.map
|