@midscene/core 1.8.11 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +40 -50
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/task-builder.mjs +39 -19
- package/dist/es/agent/task-builder.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +24 -22
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +11 -14
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/connectivity.mjs +7 -3
- package/dist/es/ai-model/connectivity.mjs.map +1 -1
- package/dist/es/ai-model/errors.mjs +9 -0
- package/dist/es/ai-model/errors.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +3 -4
- package/dist/es/ai-model/inspect.mjs +132 -144
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +46 -28
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/actions.mjs +22 -44
- package/dist/es/ai-model/models/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs +45 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs +112 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/planning.mjs +6 -7
- package/dist/es/ai-model/models/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/prompt.mjs +3 -11
- package/dist/es/ai-model/models/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/models/default.mjs +12 -0
- package/dist/es/ai-model/models/default.mjs.map +1 -0
- package/dist/es/ai-model/models/doubao.mjs +138 -0
- package/dist/es/ai-model/models/doubao.mjs.map +1 -0
- package/dist/es/ai-model/models/gemini.mjs +34 -0
- package/dist/es/ai-model/models/gemini.mjs.map +1 -0
- package/dist/es/ai-model/models/glm.mjs +37 -0
- package/dist/es/ai-model/models/glm.mjs.map +1 -0
- package/dist/es/ai-model/models/gpt.mjs +31 -0
- package/dist/es/ai-model/models/gpt.mjs.map +1 -0
- package/dist/es/ai-model/models/index.mjs +2 -0
- package/dist/es/ai-model/models/qwen.mjs +113 -0
- package/dist/es/ai-model/models/qwen.mjs.map +1 -0
- package/dist/es/ai-model/models/registry.mjs +45 -0
- package/dist/es/ai-model/models/registry.mjs.map +1 -0
- package/dist/es/ai-model/models/resolved.mjs +104 -0
- package/dist/es/ai-model/models/resolved.mjs.map +1 -0
- package/dist/es/ai-model/models/types.mjs +0 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs +142 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs.map +1 -0
- package/dist/es/ai-model/{ui-tars-planning.mjs → models/ui-tars/planning.mjs} +44 -62
- package/dist/es/ai-model/models/ui-tars/planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +3 -3
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +11 -11
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +25 -60
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -10
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs +9 -0
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs.map +1 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs +15 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs +107 -0
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +59 -190
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/json.mjs +60 -0
- package/dist/es/ai-model/service-caller/json.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs +68 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs +96 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/index.mjs +3 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs +41 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs +64 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/types.mjs +0 -0
- package/dist/es/ai-model/types.mjs +0 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs +27 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/index.mjs +2 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs +23 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs +18 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/types.mjs +0 -0
- package/dist/es/ai-model/workflows/planning/index.mjs +5 -0
- package/dist/es/ai-model/workflows/planning/index.mjs.map +1 -0
- package/dist/es/ai-model/workflows/planning/types.mjs +0 -0
- package/dist/es/common.mjs +2 -174
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/service/index.mjs +96 -69
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +4 -3
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +43 -53
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/task-builder.js +38 -18
- package/dist/lib/agent/task-builder.js.map +1 -1
- package/dist/lib/agent/tasks.js +23 -21
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +17 -17
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/connectivity.js +7 -3
- package/dist/lib/ai-model/connectivity.js.map +1 -1
- package/dist/lib/ai-model/errors.js +46 -0
- package/dist/lib/ai-model/errors.js.map +1 -0
- package/dist/lib/ai-model/index.js +7 -14
- package/dist/lib/ai-model/inspect.js +141 -144
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +44 -26
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/actions.js +22 -44
- package/dist/lib/ai-model/models/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js +79 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js +146 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/planning.js +8 -9
- package/dist/lib/ai-model/models/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/prompt.js +14 -16
- package/dist/lib/ai-model/models/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm/util.js → models/default.js} +13 -13
- package/dist/lib/ai-model/models/default.js.map +1 -0
- package/dist/lib/ai-model/models/doubao.js +184 -0
- package/dist/lib/ai-model/models/doubao.js.map +1 -0
- package/dist/lib/ai-model/models/gemini.js +68 -0
- package/dist/lib/ai-model/models/gemini.js.map +1 -0
- package/dist/lib/ai-model/models/glm.js +71 -0
- package/dist/lib/ai-model/models/glm.js.map +1 -0
- package/dist/lib/ai-model/models/gpt.js +65 -0
- package/dist/lib/ai-model/models/gpt.js.map +1 -0
- package/dist/lib/ai-model/{service-caller/image-detail.js → models/index.js} +8 -7
- package/dist/lib/ai-model/models/index.js.map +1 -0
- package/dist/lib/ai-model/models/qwen.js +147 -0
- package/dist/lib/ai-model/models/qwen.js.map +1 -0
- package/dist/lib/ai-model/models/registry.js +85 -0
- package/dist/lib/ai-model/models/registry.js.map +1 -0
- package/dist/lib/ai-model/models/resolved.js +138 -0
- package/dist/lib/ai-model/models/resolved.js.map +1 -0
- package/dist/lib/ai-model/models/types.js +20 -0
- package/dist/lib/ai-model/models/types.js.map +1 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js +176 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js.map +1 -0
- package/dist/lib/ai-model/{ui-tars-planning.js → models/ui-tars/planning.js} +44 -62
- package/dist/lib/ai-model/models/ui-tars/planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +3 -3
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +11 -11
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +25 -60
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +15 -10
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js +43 -0
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js.map +1 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js +52 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +5 -5
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +5 -5
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js +150 -0
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +68 -199
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/service-caller/json.js +100 -0
- package/dist/lib/ai-model/service-caller/json.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js +117 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js +130 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js.map +1 -0
- package/dist/lib/ai-model/{prompt/common.js → shared/model-locate-result/index.js} +9 -9
- package/dist/lib/ai-model/shared/model-locate-result/index.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js +78 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js +98 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js +20 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js.map +1 -0
- package/dist/lib/ai-model/types.js +20 -0
- package/dist/lib/ai-model/types.js.map +1 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js +61 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/index.js +50 -0
- package/dist/lib/ai-model/workflows/inspect/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js +60 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js +52 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/types.js +20 -0
- package/dist/lib/ai-model/workflows/inspect/types.js.map +1 -0
- package/dist/lib/ai-model/{model-family.js → workflows/planning/index.js} +6 -7
- package/dist/lib/ai-model/workflows/planning/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/planning/types.js +20 -0
- package/dist/lib/ai-model/workflows/planning/types.js.map +1 -0
- package/dist/lib/common.js +4 -206
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/service/index.js +96 -69
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +4 -3
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +14 -6
- package/dist/types/agent/task-builder.d.ts +2 -2
- package/dist/types/agent/tasks.d.ts +6 -6
- package/dist/types/agent/utils.d.ts +8 -5
- package/dist/types/ai-model/errors.d.ts +2 -0
- package/dist/types/ai-model/index.d.ts +2 -4
- package/dist/types/ai-model/inspect.d.ts +13 -33
- package/dist/types/ai-model/llm-planning.d.ts +6 -17
- package/dist/types/ai-model/{auto-glm → models/auto-glm}/actions.d.ts +2 -2
- package/dist/types/ai-model/models/auto-glm/adapter.d.ts +5 -0
- package/dist/types/ai-model/models/auto-glm/locate.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/planning.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/prompt.d.ts +4 -0
- package/dist/types/ai-model/models/default.d.ts +2 -0
- package/dist/types/ai-model/models/doubao.d.ts +10 -0
- package/dist/types/ai-model/models/gemini.d.ts +18 -0
- package/dist/types/ai-model/models/glm.d.ts +18 -0
- package/dist/types/ai-model/models/gpt.d.ts +18 -0
- package/dist/types/ai-model/models/index.d.ts +2 -0
- package/dist/types/ai-model/models/qwen.d.ts +30 -0
- package/dist/types/ai-model/models/registry.d.ts +81 -0
- package/dist/types/ai-model/models/resolved.d.ts +9 -0
- package/dist/types/ai-model/models/types.d.ts +102 -0
- package/dist/types/ai-model/models/ui-tars/adapter.d.ts +6 -0
- package/dist/types/ai-model/{ui-tars-planning.d.ts → models/ui-tars/planning.d.ts} +7 -11
- package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +5 -5
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/locate-grounding-rules.d.ts +1 -0
- package/dist/types/ai-model/prompt/locate-param-example.d.ts +3 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +3 -3
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +3 -3
- package/dist/types/ai-model/prompts/locate-result-coordinates.d.ts +6 -0
- package/dist/types/ai-model/service-caller/index.d.ts +19 -27
- package/dist/types/ai-model/service-caller/json.d.ts +9 -0
- package/dist/types/ai-model/shared/model-locate-result/bbox.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/factory.d.ts +2 -0
- package/dist/types/ai-model/shared/model-locate-result/index.d.ts +3 -0
- package/dist/types/ai-model/shared/model-locate-result/parse.d.ts +5 -0
- package/dist/types/ai-model/shared/model-locate-result/pixel-bbox-mapper.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/types.d.ts +157 -0
- package/dist/types/ai-model/types.d.ts +2 -0
- package/dist/types/ai-model/workflows/image-preprocess.d.ts +30 -0
- package/dist/types/ai-model/workflows/inspect/index.d.ts +1 -0
- package/dist/types/ai-model/workflows/inspect/locate-result-rect.d.ts +4 -0
- package/dist/types/ai-model/workflows/inspect/search-area-mapping.d.ts +3 -0
- package/dist/types/ai-model/workflows/inspect/types.d.ts +37 -0
- package/dist/types/ai-model/workflows/planning/index.d.ts +2 -0
- package/dist/types/ai-model/workflows/planning/types.d.ts +15 -0
- package/dist/types/common.d.ts +0 -30
- package/dist/types/device/index.d.ts +22 -22
- package/dist/types/service/index.d.ts +5 -4
- package/dist/types/types.d.ts +21 -9
- package/dist/types/yaml.d.ts +8 -2
- package/package.json +2 -2
- package/dist/es/ai-model/auto-glm/actions.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/index.mjs +0 -6
- package/dist/es/ai-model/auto-glm/parser.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/planning.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/util.mjs +0 -9
- package/dist/es/ai-model/auto-glm/util.mjs.map +0 -1
- package/dist/es/ai-model/model-family.mjs +0 -6
- package/dist/es/ai-model/model-family.mjs.map +0 -1
- package/dist/es/ai-model/prompt/common.mjs +0 -8
- package/dist/es/ai-model/prompt/common.mjs.map +0 -1
- package/dist/es/ai-model/service-caller/image-detail.mjs +0 -6
- package/dist/es/ai-model/service-caller/image-detail.mjs.map +0 -1
- package/dist/es/ai-model/ui-tars-planning.mjs.map +0 -1
- package/dist/lib/ai-model/auto-glm/actions.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/index.js +0 -66
- package/dist/lib/ai-model/auto-glm/index.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/parser.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/planning.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/prompt.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/util.js.map +0 -1
- package/dist/lib/ai-model/model-family.js.map +0 -1
- package/dist/lib/ai-model/prompt/common.js.map +0 -1
- package/dist/lib/ai-model/service-caller/image-detail.js.map +0 -1
- package/dist/lib/ai-model/ui-tars-planning.js.map +0 -1
- package/dist/types/ai-model/auto-glm/index.d.ts +0 -6
- package/dist/types/ai-model/auto-glm/planning.d.ts +0 -12
- package/dist/types/ai-model/auto-glm/prompt.d.ts +0 -27
- package/dist/types/ai-model/auto-glm/util.d.ts +0 -13
- package/dist/types/ai-model/model-family.d.ts +0 -7
- package/dist/types/ai-model/prompt/common.d.ts +0 -2
- package/dist/types/ai-model/service-caller/image-detail.d.ts +0 -2
- /package/dist/es/ai-model/{auto-glm → models/auto-glm}/parser.mjs +0 -0
- /package/dist/lib/ai-model/{auto-glm → models/auto-glm}/parser.js +0 -0
- /package/dist/types/ai-model/{auto-glm → models/auto-glm}/parser.d.ts +0 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
export interface ImagePreprocessPolicy {
|
|
2
|
+
padBlockSize?: number;
|
|
3
|
+
}
|
|
4
|
+
export interface PreparedModelImage {
|
|
5
|
+
imageBase64: string;
|
|
6
|
+
/**
|
|
7
|
+
* Size of the image sent to the model after preprocessing. This can be larger
|
|
8
|
+
* than the original screenshot when padding is applied to satisfy model block
|
|
9
|
+
* size requirements.
|
|
10
|
+
*/
|
|
11
|
+
preparedSize: {
|
|
12
|
+
width: number;
|
|
13
|
+
height: number;
|
|
14
|
+
};
|
|
15
|
+
/**
|
|
16
|
+
* Size of the real screenshot content inside the prepared image. Pixel bboxes
|
|
17
|
+
* are parsed against `preparedSize`, then clipped to `contentSize` so padding
|
|
18
|
+
* added for the model is not treated as valid UI content.
|
|
19
|
+
*/
|
|
20
|
+
contentSize: {
|
|
21
|
+
width: number;
|
|
22
|
+
height: number;
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
export declare function prepareModelImage(options: {
|
|
26
|
+
imageBase64: string;
|
|
27
|
+
width: number;
|
|
28
|
+
height: number;
|
|
29
|
+
policy: ImagePreprocessPolicy;
|
|
30
|
+
}): Promise<PreparedModelImage>;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection, buildSearchAreaConfig, } from '../../inspect';
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { PixelBbox } from '../../shared/model-locate-result';
|
|
2
|
+
import type { Rect } from '../../../types';
|
|
3
|
+
export declare function mergePixelBboxesToRect(pixelBboxes: PixelBbox[]): Rect;
|
|
4
|
+
export declare function pixelBboxToRect([left, top, right, bottom]: PixelBbox): Rect;
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import type { PixelBbox } from '../../shared/model-locate-result';
|
|
2
|
+
import type { SearchAreaImageMapping } from './types';
|
|
3
|
+
export declare function mapSearchAreaPixelBboxToOriginalPixelBbox([left, top, right, bottom]: PixelBbox, mapping?: SearchAreaImageMapping): PixelBbox;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { AIUsageInfo, Rect, UIContext } from '../../../types';
|
|
2
|
+
import type { LocateResultElement } from '@midscene/shared/types';
|
|
3
|
+
import type { TUserPrompt } from '../../../common';
|
|
4
|
+
import type { ModelRuntime } from '../../models';
|
|
5
|
+
export interface SearchAreaImageMapping {
|
|
6
|
+
offset: {
|
|
7
|
+
x: number;
|
|
8
|
+
y: number;
|
|
9
|
+
};
|
|
10
|
+
scale: number;
|
|
11
|
+
}
|
|
12
|
+
export interface SearchAreaConfig {
|
|
13
|
+
sourceRect: Rect;
|
|
14
|
+
image: {
|
|
15
|
+
imageBase64: string;
|
|
16
|
+
width: number;
|
|
17
|
+
height: number;
|
|
18
|
+
};
|
|
19
|
+
mapping: SearchAreaImageMapping;
|
|
20
|
+
}
|
|
21
|
+
export interface LocateOptions {
|
|
22
|
+
context: UIContext;
|
|
23
|
+
searchConfig?: SearchAreaConfig;
|
|
24
|
+
modelRuntime: ModelRuntime;
|
|
25
|
+
abortSignal?: AbortSignal;
|
|
26
|
+
}
|
|
27
|
+
export interface LocateResult {
|
|
28
|
+
parseResult: {
|
|
29
|
+
element?: LocateResultElement;
|
|
30
|
+
errors?: string[];
|
|
31
|
+
};
|
|
32
|
+
rect?: Rect;
|
|
33
|
+
rawResponse: string;
|
|
34
|
+
usage?: AIUsageInfo;
|
|
35
|
+
reasoning_content?: string;
|
|
36
|
+
}
|
|
37
|
+
export type LocateFn = (elementDescription: TUserPrompt, options: LocateOptions) => Promise<LocateResult>;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { DeviceAction, PlanningAIResponse, UIContext } from '../../../types';
|
|
2
|
+
import type { ConversationHistory } from '../../conversation-history';
|
|
3
|
+
import type { ModelRuntime } from '../../models';
|
|
4
|
+
export interface PlanOptions {
|
|
5
|
+
context: UIContext;
|
|
6
|
+
actionSpace: DeviceAction<any>[];
|
|
7
|
+
actionContext?: string;
|
|
8
|
+
modelRuntime: ModelRuntime;
|
|
9
|
+
conversationHistory: ConversationHistory;
|
|
10
|
+
includeLocateInPlanning: boolean;
|
|
11
|
+
imagesIncludeCount?: number;
|
|
12
|
+
deepThink?: boolean;
|
|
13
|
+
abortSignal?: AbortSignal;
|
|
14
|
+
}
|
|
15
|
+
export type PlanFn = (userInstruction: string, options: PlanOptions) => Promise<PlanningAIResponse>;
|
package/dist/types/common.d.ts
CHANGED
|
@@ -1,34 +1,5 @@
|
|
|
1
1
|
import type { BaseElement, DeviceAction, ElementTreeNode, MidsceneYamlFlowItem, PlanningAction, Rect, Size } from './types';
|
|
2
|
-
import type { ChatCompletionMessageParam } from 'openai/resources/index';
|
|
3
|
-
import type { PlanningLocateParam } from './types';
|
|
4
|
-
import type { TModelFamily } from '@midscene/shared/env';
|
|
5
2
|
import { z } from 'zod';
|
|
6
|
-
export type AIArgs = ChatCompletionMessageParam[];
|
|
7
|
-
type AdaptBboxInput = number[] | string[] | string | (number[] | string[])[];
|
|
8
|
-
/**
|
|
9
|
-
* Convert a point coordinate [0, 1000] to a small bbox [0, 1000]
|
|
10
|
-
* Creates a small bbox around the center point in the same coordinate space
|
|
11
|
-
*
|
|
12
|
-
* @param x - X coordinate in [0, 1000] range
|
|
13
|
-
* @param y - Y coordinate in [0, 1000] range
|
|
14
|
-
* @param bboxSize - Size of the bbox to create (default: 20)
|
|
15
|
-
* @returns [x1, y1, x2, y2] bbox in [0, 1000] coordinate space
|
|
16
|
-
*/
|
|
17
|
-
export declare function pointToBbox(x: number, y: number, bboxSize?: number): [number, number, number, number];
|
|
18
|
-
export declare function fillBboxParam(locate: PlanningLocateParam, width: number, height: number, modelFamily: TModelFamily | undefined): PlanningLocateParam;
|
|
19
|
-
export declare function adaptQwen2_5Bbox(bbox: number[]): [number, number, number, number];
|
|
20
|
-
export declare function adaptGpt5Bbox(bbox: number[] | string[] | string): [number, number, number, number];
|
|
21
|
-
export declare function adaptDoubaoBbox(bbox: string[] | number[] | string, width: number, height: number): [number, number, number, number];
|
|
22
|
-
export declare function adaptBbox(bbox: AdaptBboxInput, width: number, height: number, modelFamily: TModelFamily | undefined): [number, number, number, number];
|
|
23
|
-
export declare function normalized01000(bbox: number[] | string[] | string, width: number, height: number, modelFamily?: TModelFamily | undefined): [number, number, number, number];
|
|
24
|
-
export declare function adaptGeminiBbox(bbox: number[], width: number, height: number): [number, number, number, number];
|
|
25
|
-
export declare function adaptBboxToRect(bbox: number[], width: number, height: number, offsetX?: number, offsetY?: number, rightLimit?: number, bottomLimit?: number, modelFamily?: TModelFamily | undefined, scale?: number): Rect;
|
|
26
|
-
export declare function mergeRects(rects: Rect[]): {
|
|
27
|
-
left: number;
|
|
28
|
-
top: number;
|
|
29
|
-
width: number;
|
|
30
|
-
height: number;
|
|
31
|
-
};
|
|
32
3
|
/**
|
|
33
4
|
* Expand the search area to at least 400 x 400 pixels
|
|
34
5
|
*
|
|
@@ -285,4 +256,3 @@ export declare const finalizeActionName = "Finalize";
|
|
|
285
256
|
* @returns A formatted time string with format label
|
|
286
257
|
*/
|
|
287
258
|
export declare const getReadableTimeString: (format?: string, timestamp?: number) => string;
|
|
288
|
-
export {};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
+
import type { ModelRuntime } from '../ai-model/models';
|
|
1
2
|
import type { ActionScrollParam, DeviceAction, LocateResultElement } from '../types';
|
|
2
|
-
import type { IModelConfig } from '@midscene/shared/env';
|
|
3
3
|
import type { ElementNode } from '@midscene/shared/extractor';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import type { ElementCacheFeature, Rect, Size } from '../types';
|
|
@@ -118,7 +118,7 @@ export declare abstract class AbstractInterface {
|
|
|
118
118
|
abstract actionSpace(): DeviceAction[];
|
|
119
119
|
abstract cacheFeatureForPoint?(center: [number, number], options?: {
|
|
120
120
|
targetDescription?: string;
|
|
121
|
-
|
|
121
|
+
modelRuntime?: ModelRuntime;
|
|
122
122
|
}): Promise<ElementCacheFeature>;
|
|
123
123
|
abstract rectMatchesCacheFeature?(feature: ElementCacheFeature): Promise<Rect>;
|
|
124
124
|
abstract destroy?(): Promise<void>;
|
|
@@ -295,8 +295,8 @@ export declare const actionTapParamSchema: z.ZodObject<{
|
|
|
295
295
|
}[] | undefined;
|
|
296
296
|
convertHttpImage2Base64?: boolean | undefined;
|
|
297
297
|
});
|
|
298
|
-
deepLocate?: boolean | undefined;
|
|
299
298
|
deepThink?: boolean | undefined;
|
|
299
|
+
deepLocate?: boolean | undefined;
|
|
300
300
|
cacheable?: boolean | undefined;
|
|
301
301
|
xpath?: string | boolean | undefined;
|
|
302
302
|
} & {
|
|
@@ -313,8 +313,8 @@ export declare const actionTapParamSchema: z.ZodObject<{
|
|
|
313
313
|
}[] | undefined;
|
|
314
314
|
convertHttpImage2Base64?: boolean | undefined;
|
|
315
315
|
});
|
|
316
|
-
deepLocate?: boolean | undefined;
|
|
317
316
|
deepThink?: boolean | undefined;
|
|
317
|
+
deepLocate?: boolean | undefined;
|
|
318
318
|
cacheable?: boolean | undefined;
|
|
319
319
|
xpath?: string | boolean | undefined;
|
|
320
320
|
} & {
|
|
@@ -446,8 +446,8 @@ export declare const actionRightClickParamSchema: z.ZodObject<{
|
|
|
446
446
|
}[] | undefined;
|
|
447
447
|
convertHttpImage2Base64?: boolean | undefined;
|
|
448
448
|
});
|
|
449
|
-
deepLocate?: boolean | undefined;
|
|
450
449
|
deepThink?: boolean | undefined;
|
|
450
|
+
deepLocate?: boolean | undefined;
|
|
451
451
|
cacheable?: boolean | undefined;
|
|
452
452
|
xpath?: string | boolean | undefined;
|
|
453
453
|
} & {
|
|
@@ -464,8 +464,8 @@ export declare const actionRightClickParamSchema: z.ZodObject<{
|
|
|
464
464
|
}[] | undefined;
|
|
465
465
|
convertHttpImage2Base64?: boolean | undefined;
|
|
466
466
|
});
|
|
467
|
-
deepLocate?: boolean | undefined;
|
|
468
467
|
deepThink?: boolean | undefined;
|
|
468
|
+
deepLocate?: boolean | undefined;
|
|
469
469
|
cacheable?: boolean | undefined;
|
|
470
470
|
xpath?: string | boolean | undefined;
|
|
471
471
|
} & {
|
|
@@ -597,8 +597,8 @@ export declare const actionDoubleClickParamSchema: z.ZodObject<{
|
|
|
597
597
|
}[] | undefined;
|
|
598
598
|
convertHttpImage2Base64?: boolean | undefined;
|
|
599
599
|
});
|
|
600
|
-
deepLocate?: boolean | undefined;
|
|
601
600
|
deepThink?: boolean | undefined;
|
|
601
|
+
deepLocate?: boolean | undefined;
|
|
602
602
|
cacheable?: boolean | undefined;
|
|
603
603
|
xpath?: string | boolean | undefined;
|
|
604
604
|
} & {
|
|
@@ -615,8 +615,8 @@ export declare const actionDoubleClickParamSchema: z.ZodObject<{
|
|
|
615
615
|
}[] | undefined;
|
|
616
616
|
convertHttpImage2Base64?: boolean | undefined;
|
|
617
617
|
});
|
|
618
|
-
deepLocate?: boolean | undefined;
|
|
619
618
|
deepThink?: boolean | undefined;
|
|
619
|
+
deepLocate?: boolean | undefined;
|
|
620
620
|
cacheable?: boolean | undefined;
|
|
621
621
|
xpath?: string | boolean | undefined;
|
|
622
622
|
} & {
|
|
@@ -748,8 +748,8 @@ export declare const actionHoverParamSchema: z.ZodObject<{
|
|
|
748
748
|
}[] | undefined;
|
|
749
749
|
convertHttpImage2Base64?: boolean | undefined;
|
|
750
750
|
});
|
|
751
|
-
deepLocate?: boolean | undefined;
|
|
752
751
|
deepThink?: boolean | undefined;
|
|
752
|
+
deepLocate?: boolean | undefined;
|
|
753
753
|
cacheable?: boolean | undefined;
|
|
754
754
|
xpath?: string | boolean | undefined;
|
|
755
755
|
} & {
|
|
@@ -766,8 +766,8 @@ export declare const actionHoverParamSchema: z.ZodObject<{
|
|
|
766
766
|
}[] | undefined;
|
|
767
767
|
convertHttpImage2Base64?: boolean | undefined;
|
|
768
768
|
});
|
|
769
|
-
deepLocate?: boolean | undefined;
|
|
770
769
|
deepThink?: boolean | undefined;
|
|
770
|
+
deepLocate?: boolean | undefined;
|
|
771
771
|
cacheable?: boolean | undefined;
|
|
772
772
|
xpath?: string | boolean | undefined;
|
|
773
773
|
} & {
|
|
@@ -1290,8 +1290,8 @@ export declare const actionScrollParamSchema: z.ZodObject<{
|
|
|
1290
1290
|
xpath: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodBoolean]>>;
|
|
1291
1291
|
}, z.ZodTypeAny, "passthrough">>>;
|
|
1292
1292
|
}, "strip", z.ZodTypeAny, {
|
|
1293
|
+
direction: "left" | "right" | "up" | "down";
|
|
1293
1294
|
scrollType: "singleAction" | "scrollToBottom" | "scrollToTop" | "scrollToRight" | "scrollToLeft";
|
|
1294
|
-
direction: "left" | "right" | "down" | "up";
|
|
1295
1295
|
locate?: z.objectOutputType<{
|
|
1296
1296
|
prompt: z.ZodUnion<[z.ZodString, z.ZodIntersection<z.ZodObject<{
|
|
1297
1297
|
prompt: z.ZodString;
|
|
@@ -1368,8 +1368,8 @@ export declare const actionScrollParamSchema: z.ZodObject<{
|
|
|
1368
1368
|
cacheable: z.ZodOptional<z.ZodBoolean>;
|
|
1369
1369
|
xpath: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodBoolean]>>;
|
|
1370
1370
|
}, z.ZodTypeAny, "passthrough"> | undefined;
|
|
1371
|
+
direction?: "left" | "right" | "up" | "down" | undefined;
|
|
1371
1372
|
scrollType?: "singleAction" | "scrollToBottom" | "scrollToTop" | "scrollToRight" | "scrollToLeft" | undefined;
|
|
1372
|
-
direction?: "left" | "right" | "down" | "up" | undefined;
|
|
1373
1373
|
distance?: number | null | undefined;
|
|
1374
1374
|
}>;
|
|
1375
1375
|
export declare const defineActionScroll: (scroll: ScrollInputPrimitives["scroll"]) => DeviceAction<ActionScrollParam>;
|
|
@@ -1603,8 +1603,8 @@ export declare const actionDragAndDropParamSchema: z.ZodObject<{
|
|
|
1603
1603
|
}[] | undefined;
|
|
1604
1604
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1605
1605
|
});
|
|
1606
|
-
deepLocate?: boolean | undefined;
|
|
1607
1606
|
deepThink?: boolean | undefined;
|
|
1607
|
+
deepLocate?: boolean | undefined;
|
|
1608
1608
|
cacheable?: boolean | undefined;
|
|
1609
1609
|
xpath?: string | boolean | undefined;
|
|
1610
1610
|
} & {
|
|
@@ -1620,8 +1620,8 @@ export declare const actionDragAndDropParamSchema: z.ZodObject<{
|
|
|
1620
1620
|
}[] | undefined;
|
|
1621
1621
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1622
1622
|
});
|
|
1623
|
-
deepLocate?: boolean | undefined;
|
|
1624
1623
|
deepThink?: boolean | undefined;
|
|
1624
|
+
deepLocate?: boolean | undefined;
|
|
1625
1625
|
cacheable?: boolean | undefined;
|
|
1626
1626
|
xpath?: string | boolean | undefined;
|
|
1627
1627
|
} & {
|
|
@@ -1638,8 +1638,8 @@ export declare const actionDragAndDropParamSchema: z.ZodObject<{
|
|
|
1638
1638
|
}[] | undefined;
|
|
1639
1639
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1640
1640
|
});
|
|
1641
|
-
deepLocate?: boolean | undefined;
|
|
1642
1641
|
deepThink?: boolean | undefined;
|
|
1642
|
+
deepLocate?: boolean | undefined;
|
|
1643
1643
|
cacheable?: boolean | undefined;
|
|
1644
1644
|
xpath?: string | boolean | undefined;
|
|
1645
1645
|
} & {
|
|
@@ -1655,8 +1655,8 @@ export declare const actionDragAndDropParamSchema: z.ZodObject<{
|
|
|
1655
1655
|
}[] | undefined;
|
|
1656
1656
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1657
1657
|
});
|
|
1658
|
-
deepLocate?: boolean | undefined;
|
|
1659
1658
|
deepThink?: boolean | undefined;
|
|
1659
|
+
deepLocate?: boolean | undefined;
|
|
1660
1660
|
cacheable?: boolean | undefined;
|
|
1661
1661
|
xpath?: string | boolean | undefined;
|
|
1662
1662
|
} & {
|
|
@@ -1790,8 +1790,8 @@ export declare const ActionLongPressParamSchema: z.ZodObject<{
|
|
|
1790
1790
|
}[] | undefined;
|
|
1791
1791
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1792
1792
|
});
|
|
1793
|
-
deepLocate?: boolean | undefined;
|
|
1794
1793
|
deepThink?: boolean | undefined;
|
|
1794
|
+
deepLocate?: boolean | undefined;
|
|
1795
1795
|
cacheable?: boolean | undefined;
|
|
1796
1796
|
xpath?: string | boolean | undefined;
|
|
1797
1797
|
} & {
|
|
@@ -1809,8 +1809,8 @@ export declare const ActionLongPressParamSchema: z.ZodObject<{
|
|
|
1809
1809
|
}[] | undefined;
|
|
1810
1810
|
convertHttpImage2Base64?: boolean | undefined;
|
|
1811
1811
|
});
|
|
1812
|
-
deepLocate?: boolean | undefined;
|
|
1813
1812
|
deepThink?: boolean | undefined;
|
|
1813
|
+
deepLocate?: boolean | undefined;
|
|
1814
1814
|
cacheable?: boolean | undefined;
|
|
1815
1815
|
xpath?: string | boolean | undefined;
|
|
1816
1816
|
} & {
|
|
@@ -2048,7 +2048,8 @@ export declare const ActionSwipeParamSchema: z.ZodObject<{
|
|
|
2048
2048
|
repeat: z.ZodOptional<z.ZodNumber>;
|
|
2049
2049
|
}, "strip", z.ZodTypeAny, {
|
|
2050
2050
|
duration: number;
|
|
2051
|
-
|
|
2051
|
+
repeat?: number | undefined;
|
|
2052
|
+
direction?: "left" | "right" | "up" | "down" | undefined;
|
|
2052
2053
|
distance?: number | undefined;
|
|
2053
2054
|
start?: z.objectOutputType<{
|
|
2054
2055
|
prompt: z.ZodUnion<[z.ZodString, z.ZodIntersection<z.ZodObject<{
|
|
@@ -2124,9 +2125,9 @@ export declare const ActionSwipeParamSchema: z.ZodObject<{
|
|
|
2124
2125
|
cacheable: z.ZodOptional<z.ZodBoolean>;
|
|
2125
2126
|
xpath: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodBoolean]>>;
|
|
2126
2127
|
}, z.ZodTypeAny, "passthrough"> | undefined;
|
|
2127
|
-
repeat?: number | undefined;
|
|
2128
2128
|
}, {
|
|
2129
|
-
|
|
2129
|
+
repeat?: number | undefined;
|
|
2130
|
+
direction?: "left" | "right" | "up" | "down" | undefined;
|
|
2130
2131
|
distance?: number | undefined;
|
|
2131
2132
|
duration?: number | undefined;
|
|
2132
2133
|
start?: z.objectInputType<{
|
|
@@ -2203,7 +2204,6 @@ export declare const ActionSwipeParamSchema: z.ZodObject<{
|
|
|
2203
2204
|
cacheable: z.ZodOptional<z.ZodBoolean>;
|
|
2204
2205
|
xpath: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodBoolean]>>;
|
|
2205
2206
|
}, z.ZodTypeAny, "passthrough"> | undefined;
|
|
2206
|
-
repeat?: number | undefined;
|
|
2207
2207
|
}>;
|
|
2208
2208
|
export type ActionSwipeParam = {
|
|
2209
2209
|
start?: LocateResultElement;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
+
import type { ModelRuntime } from '../ai-model/models';
|
|
1
2
|
import type { AIDescribeElementResponse, LocateResultElement, LocateResultWithDump, PlanningLocateParam, Rect, ServiceExtractOption, ServiceExtractParam, ServiceExtractResult, ServiceTaskInfo, UIContext } from '../types';
|
|
2
|
-
import type { IModelConfig } from '@midscene/shared/env';
|
|
3
3
|
import type { TMultimodalPrompt } from '../common';
|
|
4
4
|
export interface LocateOpts {
|
|
5
5
|
context?: UIContext;
|
|
@@ -15,9 +15,10 @@ export default class Service {
|
|
|
15
15
|
contextRetrieverFn: () => Promise<UIContext> | UIContext;
|
|
16
16
|
taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;
|
|
17
17
|
constructor(context: UIContext | (() => Promise<UIContext> | UIContext), opt?: ServiceOptions);
|
|
18
|
-
locate(query: PlanningLocateParam, opt: LocateOpts,
|
|
19
|
-
|
|
20
|
-
|
|
18
|
+
locate(query: PlanningLocateParam, opt: LocateOpts, modelRuntime: ModelRuntime, abortSignal?: AbortSignal): Promise<LocateResultWithDump>;
|
|
19
|
+
private resolveLocateSearchArea;
|
|
20
|
+
extract<T>(dataDemand: ServiceExtractParam, modelRuntime: ModelRuntime, opt?: ServiceExtractOption, pageDescription?: string, multimodalPrompt?: TMultimodalPrompt, context?: UIContext): Promise<ServiceExtractResult<T>>;
|
|
21
|
+
describe(target: Rect | [number, number], modelRuntime: ModelRuntime, opt?: {
|
|
21
22
|
deepLocate?: boolean;
|
|
22
23
|
}): Promise<Pick<AIDescribeElementResponse, 'description'>>;
|
|
23
24
|
}
|
package/dist/types/types.d.ts
CHANGED
|
@@ -38,19 +38,25 @@ export type AISingleElementResponseByPosition = {
|
|
|
38
38
|
reason: string;
|
|
39
39
|
text: string;
|
|
40
40
|
};
|
|
41
|
-
export
|
|
42
|
-
|
|
41
|
+
export type LocateResultPoint = [number, number];
|
|
42
|
+
export type Bbox = [number, number, number, number];
|
|
43
|
+
export type LocateResultBbox = Bbox;
|
|
44
|
+
export type PixelBbox = Bbox;
|
|
45
|
+
export interface AIElementLocateResponse {
|
|
46
|
+
bbox?: LocateResultBbox;
|
|
47
|
+
point?: LocateResultPoint;
|
|
43
48
|
errors?: string[];
|
|
44
49
|
}
|
|
45
|
-
export type AIElementResponse = AIElementCoordinatesResponse;
|
|
46
50
|
export interface AIDataExtractionResponse<DataDemand> {
|
|
47
51
|
data: DataDemand;
|
|
48
52
|
errors?: string[];
|
|
49
53
|
thought?: string;
|
|
50
54
|
}
|
|
51
55
|
export interface AISectionLocatorResponse {
|
|
52
|
-
bbox
|
|
53
|
-
|
|
56
|
+
bbox?: LocateResultBbox;
|
|
57
|
+
point?: LocateResultPoint;
|
|
58
|
+
references_bbox?: LocateResultBbox[];
|
|
59
|
+
references_point?: LocateResultPoint[];
|
|
54
60
|
error?: string;
|
|
55
61
|
}
|
|
56
62
|
export interface AIAssertionResponse {
|
|
@@ -140,7 +146,7 @@ export interface ServiceDump extends DumpMeta {
|
|
|
140
146
|
dataDemand?: ServiceExtractParam;
|
|
141
147
|
assertion?: TUserPrompt;
|
|
142
148
|
};
|
|
143
|
-
matchedElement
|
|
149
|
+
matchedElement?: LocateResultElement[];
|
|
144
150
|
matchedRect?: Rect;
|
|
145
151
|
deepLocate?: boolean;
|
|
146
152
|
data: any;
|
|
@@ -187,8 +193,13 @@ export interface AgentAssertOpt {
|
|
|
187
193
|
*
|
|
188
194
|
*/
|
|
189
195
|
export interface PlanningLocateParam extends DetailedLocateParam {
|
|
190
|
-
bbox?:
|
|
196
|
+
bbox?: LocateResultBbox;
|
|
197
|
+
point?: LocateResultPoint;
|
|
191
198
|
}
|
|
199
|
+
export type PlanningLocateParamWithLocatedPixelBbox = PlanningLocateParam & {
|
|
200
|
+
/** Pixel bbox of the located element in screenshot coordinates. */
|
|
201
|
+
locatedPixelBbox: PixelBbox;
|
|
202
|
+
};
|
|
192
203
|
export interface PlanningAction<ParamType = any> {
|
|
193
204
|
thought?: string;
|
|
194
205
|
log?: string;
|
|
@@ -438,7 +449,7 @@ export interface DeviceAction<TParam = any, TReturn = any> {
|
|
|
438
449
|
delayAfterRunner?: number;
|
|
439
450
|
/**
|
|
440
451
|
* An example param object for this action.
|
|
441
|
-
* Locate fields with { prompt }
|
|
452
|
+
* Locate fields with { prompt } may be resolved to internal pixel bboxes when needed.
|
|
442
453
|
*/
|
|
443
454
|
sample?: {
|
|
444
455
|
[K in keyof TParam]?: any;
|
|
@@ -511,7 +522,8 @@ export interface AgentOpt {
|
|
|
511
522
|
cache?: Cache;
|
|
512
523
|
/**
|
|
513
524
|
* Maximum number of replanning cycles for aiAct.
|
|
514
|
-
* Defaults
|
|
525
|
+
* Defaults are resolved by the active model adapter: 20 for standard planning,
|
|
526
|
+
* 40 for UI-TARS, and 100 for Auto-GLM.
|
|
515
527
|
* If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.
|
|
516
528
|
*/
|
|
517
529
|
replanningCycleLimit?: number;
|
package/dist/types/yaml.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { TMultimodalPrompt, TUserPrompt } from './common';
|
|
2
|
-
import type { AndroidDeviceOpt, IOSDeviceOpt } from './device';
|
|
2
|
+
import type { AndroidDeviceOpt, HarmonyDeviceOpt, IOSDeviceOpt } from './device';
|
|
3
3
|
import type { AgentOpt, LocateResultElement } from './types';
|
|
4
4
|
import type { UIContext } from './types';
|
|
5
5
|
export interface LocateOption extends Partial<TMultimodalPrompt> {
|
|
@@ -33,6 +33,7 @@ export interface MidsceneYamlScript {
|
|
|
33
33
|
web?: MidsceneYamlScriptWebEnv;
|
|
34
34
|
android?: MidsceneYamlScriptAndroidEnv;
|
|
35
35
|
ios?: MidsceneYamlScriptIOSEnv;
|
|
36
|
+
harmony?: MidsceneYamlScriptHarmonyEnv;
|
|
36
37
|
computer?: MidsceneYamlScriptComputerEnv;
|
|
37
38
|
interface?: MidsceneYamlScriptEnvGeneralInterface;
|
|
38
39
|
config?: MidsceneYamlScriptConfig;
|
|
@@ -133,10 +134,15 @@ export interface MidsceneYamlScriptAndroidEnv extends MidsceneYamlScriptConfig,
|
|
|
133
134
|
export interface MidsceneYamlScriptIOSEnv extends MidsceneYamlScriptConfig, Omit<IOSDeviceOpt, 'customActions'> {
|
|
134
135
|
launch?: string;
|
|
135
136
|
}
|
|
137
|
+
export interface MidsceneYamlScriptHarmonyEnv extends MidsceneYamlScriptConfig, Omit<HarmonyDeviceOpt, 'customActions'> {
|
|
138
|
+
deviceId?: string;
|
|
139
|
+
launch?: string;
|
|
140
|
+
appNameMapping?: Record<string, string>;
|
|
141
|
+
}
|
|
136
142
|
export interface MidsceneYamlScriptComputerEnv extends MidsceneYamlScriptConfig {
|
|
137
143
|
displayId?: string;
|
|
138
144
|
}
|
|
139
|
-
export type MidsceneYamlScriptEnv = MidsceneYamlScriptWebEnv | MidsceneYamlScriptAndroidEnv | MidsceneYamlScriptIOSEnv | MidsceneYamlScriptComputerEnv;
|
|
145
|
+
export type MidsceneYamlScriptEnv = MidsceneYamlScriptWebEnv | MidsceneYamlScriptAndroidEnv | MidsceneYamlScriptIOSEnv | MidsceneYamlScriptHarmonyEnv | MidsceneYamlScriptComputerEnv;
|
|
140
146
|
export interface MidsceneYamlFlowItemAIAction {
|
|
141
147
|
aiAction?: string;
|
|
142
148
|
ai?: string;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@midscene/core",
|
|
3
3
|
"description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
|
|
4
|
-
"version": "1.
|
|
4
|
+
"version": "1.9.0",
|
|
5
5
|
"repository": "https://github.com/web-infra-dev/midscene",
|
|
6
6
|
"homepage": "https://midscenejs.com/",
|
|
7
7
|
"main": "./dist/lib/index.js",
|
|
@@ -97,7 +97,7 @@
|
|
|
97
97
|
"semver": "7.5.2",
|
|
98
98
|
"undici": "^6.0.0",
|
|
99
99
|
"zod": "^3.25.1",
|
|
100
|
-
"@midscene/shared": "1.
|
|
100
|
+
"@midscene/shared": "1.9.0"
|
|
101
101
|
},
|
|
102
102
|
"devDependencies": {
|
|
103
103
|
"@rslib/core": "^0.18.3",
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/auto-glm/actions.mjs","sources":["../../../../src/ai-model/auto-glm/actions.ts"],"sourcesContent":["import { adaptBbox, pointToBbox } from '@/common';\nimport type { DeviceAction } from '@/device';\nimport type { PlanningAction } from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\n\nconst debug = getDebug('auto-glm-actions');\n\n/**\n * Auto-GLM coordinate system range: [0, AUTO_GLM_COORDINATE_MAX]\n */\nconst AUTO_GLM_COORDINATE_MAX = 1000;\n\n/**\n * Convert auto-glm coordinate [0,1000] to bbox in pixel coordinates\n */\nfunction autoGLMCoordinateToBbox(\n x: number,\n y: number,\n width: number,\n height: number,\n): [number, number, number, number] {\n const bbox = pointToBbox(x, y, 10);\n return adaptBbox(bbox, width, height, 'auto-glm');\n}\n\nexport interface BaseAction {\n _metadata: string;\n think?: string;\n}\n\nexport interface TapAction extends BaseAction {\n _metadata: 'do';\n action: 'Tap';\n element: [number, number];\n}\n\nexport interface DoubleTapAction extends BaseAction {\n _metadata: 'do';\n action: 'Double Tap';\n element: [number, number];\n}\n\nexport interface TypeAction extends BaseAction {\n _metadata: 'do';\n action: 'Type';\n text: string;\n}\n\nexport interface SwipeAction extends BaseAction {\n _metadata: 'do';\n action: 'Swipe';\n start: [number, number];\n end: [number, number];\n}\n\nexport interface LongPressAction extends BaseAction {\n _metadata: 'do';\n action: 'Long Press';\n element: [number, number];\n}\n\nexport interface LaunchAction extends BaseAction {\n _metadata: 'do';\n action: 'Launch';\n app: string;\n}\n\nexport interface BackAction extends BaseAction {\n _metadata: 'do';\n action: 'Back';\n}\n\nexport interface HomeAction extends BaseAction {\n _metadata: 'do';\n action: 'Home';\n}\n\nexport interface WaitAction extends BaseAction {\n _metadata: 'do';\n action: 'Wait';\n durationMs: number;\n}\n\nexport interface InteractAction extends BaseAction {\n _metadata: 'do';\n action: 'Interact';\n}\n\nexport interface CallAPIAction extends BaseAction {\n _metadata: 'do';\n action: 'Call_API';\n instruction: string;\n}\n\nexport interface TakeoverAction extends BaseAction {\n _metadata: 'do';\n action: 'Take_over';\n message: string;\n}\n\nexport interface NoteAction extends BaseAction {\n _metadata: 'do';\n action: 'Note';\n message: string;\n}\n\nexport interface FinishAction extends BaseAction {\n _metadata: 'finish';\n message: string;\n}\n\nexport type ParsedAction =\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction\n | FinishAction;\n\nconst BACK_BUTTON_NAMES = ['AndroidBackButton', 'HarmonyBackButton'];\nconst HOME_BUTTON_NAMES = ['AndroidHomeButton', 'HarmonyHomeButton'];\n\n/**\n * Find the action name in actionSpace that matches one of the known names.\n * Falls back to defaultName if no match found or actionSpace is not provided.\n */\nfunction findActionName(\n actionSpace: DeviceAction[] | undefined,\n knownNames: string[],\n defaultName: string,\n): string {\n if (!actionSpace) return defaultName;\n const match = actionSpace.find((a) => knownNames.includes(a.name));\n return match ? match.name : defaultName;\n}\n\nexport function transformAutoGLMAction(\n action: ParsedAction,\n size: { width: number; height: number },\n actionSpace?: DeviceAction[],\n): PlanningAction[] {\n try {\n switch (action._metadata) {\n case 'finish': {\n const finishAction = action as FinishAction;\n debug('Transform finish action:', finishAction);\n return [\n {\n type: 'Finished',\n param: {},\n thought: finishAction.message,\n },\n ];\n }\n case 'do': {\n const doAction = action as\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction;\n\n switch ((doAction as any).action) {\n case 'Tap': {\n const tapAction = doAction as TapAction;\n debug('Transform Tap action:', tapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n tapAction.element[0],\n tapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'Tap',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Double Tap': {\n const doubleTapAction = doAction as DoubleTapAction;\n debug('Transform Double Tap action:', doubleTapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n doubleTapAction.element[0],\n doubleTapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'DoubleClick',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Type': {\n const typeAction = doAction as TypeAction;\n debug('Transform Type action:', typeAction);\n\n return [\n {\n type: 'Input',\n param: {\n value: typeAction.text,\n },\n },\n ];\n }\n case 'Swipe': {\n const swipeAction = doAction as SwipeAction;\n debug('Transform Swipe action:', swipeAction);\n\n // Calculate locate using start coordinate\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n swipeAction.start[0],\n swipeAction.start[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n // Calculate horizontal and vertical delta in [0,AUTO_GLM_COORDINATE_MAX] coordinate system\n const deltaX = swipeAction.end[0] - swipeAction.start[0];\n const deltaY = swipeAction.end[1] - swipeAction.start[1];\n\n // Determine direction and distance\n let direction: 'up' | 'down' | 'left' | 'right';\n let distance: number;\n\n const absDeltaX = Math.abs(deltaX);\n const absDeltaY = Math.abs(deltaY);\n\n if (absDeltaY > absDeltaX) {\n // Vertical scroll\n distance = Math.round(\n (absDeltaY * size.height) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaY > 0 ? 'up' : 'down';\n } else {\n // Horizontal scroll\n distance = Math.round(\n (absDeltaX * size.width) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaX > 0 ? 'left' : 'right';\n }\n\n debug(\n `Calculate swipe direction: ${direction}, distance: ${distance}`,\n );\n\n return [\n {\n type: 'Scroll',\n param: {\n locate,\n // The scrolling direction here all refers to which direction of the page's content will appear on the screen.\n distance,\n direction,\n },\n thought: swipeAction.think || '',\n },\n ];\n }\n case 'Long Press': {\n const longPressAction = doAction as LongPressAction;\n debug('Transform Long Press action:', longPressAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n longPressAction.element[0],\n longPressAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'LongPress',\n param: {\n locate,\n },\n thought: longPressAction.think || '',\n },\n ];\n }\n case 'Back': {\n const backAction = doAction as BackAction;\n debug('Transform Back action:', backAction);\n return [\n {\n type: findActionName(\n actionSpace,\n BACK_BUTTON_NAMES,\n 'AndroidBackButton',\n ),\n param: {},\n thought: backAction.think || '',\n },\n ];\n }\n case 'Home': {\n const homeAction = doAction as HomeAction;\n debug('Transform Home action:', homeAction);\n return [\n {\n type: findActionName(\n actionSpace,\n HOME_BUTTON_NAMES,\n 'AndroidHomeButton',\n ),\n param: {},\n thought: homeAction.think || '',\n },\n ];\n }\n case 'Wait': {\n const waitAction = doAction as WaitAction;\n debug('Transform Wait action:', waitAction);\n return [\n {\n type: 'Sleep',\n param: {\n timeMs: waitAction.durationMs,\n },\n thought: waitAction.think || '',\n },\n ];\n }\n case 'Launch': {\n const launchAction = doAction as LaunchAction;\n debug('Transform Launch action:', launchAction);\n return [\n {\n type: 'Launch',\n param: { uri: launchAction.app },\n thought: launchAction.think || '',\n },\n ];\n }\n case 'Interact': {\n throw new Error(\n `Action \"Interact\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Call_API': {\n throw new Error(\n `Action \"Call_API\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Take_over': {\n throw new Error(\n `Action \"Take_over\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Note': {\n throw new Error(\n `Action \"Note\" from auto-glm is not supported in the current implementation.`,\n );\n }\n default:\n throw new Error(\n `Unknown do() action type: ${(doAction as any).action}`,\n );\n }\n }\n default:\n throw new Error(\n `Unknown action metadata: ${(action as any)._metadata}`,\n );\n }\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n debug('Transform error:', errorMessage);\n throw new Error(`Failed to transform action: ${errorMessage}`);\n }\n}\n"],"names":["debug","getDebug","AUTO_GLM_COORDINATE_MAX","autoGLMCoordinateToBbox","x","y","width","height","bbox","pointToBbox","adaptBbox","BACK_BUTTON_NAMES","HOME_BUTTON_NAMES","findActionName","actionSpace","knownNames","defaultName","match","a","transformAutoGLMAction","action","size","finishAction","doAction","tapAction","x1","y1","x2","y2","locate","doubleTapAction","typeAction","swipeAction","deltaX","deltaY","direction","distance","absDeltaX","Math","absDeltaY","longPressAction","backAction","homeAction","waitAction","launchAction","Error","error","errorMessage","String"],"mappings":";;AAKA,MAAMA,QAAQC,SAAS;AAKvB,MAAMC,0BAA0B;AAKhC,SAASC,wBACPC,CAAS,EACTC,CAAS,EACTC,KAAa,EACbC,MAAc;IAEd,MAAMC,OAAOC,YAAYL,GAAGC,GAAG;IAC/B,OAAOK,UAAUF,MAAMF,OAAOC,QAAQ;AACxC;AAwGA,MAAMI,oBAAoB;IAAC;IAAqB;CAAoB;AACpE,MAAMC,oBAAoB;IAAC;IAAqB;CAAoB;AAMpE,SAASC,eACPC,WAAuC,EACvCC,UAAoB,EACpBC,WAAmB;IAEnB,IAAI,CAACF,aAAa,OAAOE;IACzB,MAAMC,QAAQH,YAAY,IAAI,CAAC,CAACI,IAAMH,WAAW,QAAQ,CAACG,EAAE,IAAI;IAChE,OAAOD,QAAQA,MAAM,IAAI,GAAGD;AAC9B;AAEO,SAASG,uBACdC,MAAoB,EACpBC,IAAuC,EACvCP,WAA4B;IAE5B,IAAI;QACF,OAAQM,OAAO,SAAS;YACtB,KAAK;gBAAU;oBACb,MAAME,eAAeF;oBACrBpB,MAAM,4BAA4BsB;oBAClC,OAAO;wBACL;4BACE,MAAM;4BACN,OAAO,CAAC;4BACR,SAASA,aAAa,OAAO;wBAC/B;qBACD;gBACH;YACA,KAAK;gBAAM;oBACT,MAAMC,WAAWH;oBAejB,OAASG,SAAiB,MAAM;wBAC9B,KAAK;4BAAO;gCACV,MAAMC,YAAYD;gCAClBvB,MAAM,yBAAyBwB;gCAC/B,MAAM,CAACC,IAAIC,IAAIC,IAAIC,GAAG,GAAGzB,wBACvBqB,UAAU,OAAO,CAAC,EAAE,EACpBA,UAAU,OAAO,CAAC,EAAE,EACpBH,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMC,kBAAkBP;gCACxBvB,MAAM,gCAAgC8B;gCACtC,MAAM,CAACL,IAAIC,IAAIC,IAAIC,GAAG,GAAGzB,wBACvB2B,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BT,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAME,aAAaR;gCACnBvB,MAAM,0BAA0B+B;gCAEhC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,OAAOA,WAAW,IAAI;wCACxB;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAS;gCACZ,MAAMC,cAAcT;gCACpBvB,MAAM,2BAA2BgC;gCAGjC,MAAM,CAACP,IAAIC,IAAIC,IAAIC,GAAG,GAAGzB,wBACvB6B,YAAY,KAAK,CAAC,EAAE,EACpBA,YAAY,KAAK,CAAC,EAAE,EACpBX,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAGA,MAAMK,SAASD,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCACxD,MAAME,SAASF,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCAGxD,IAAIG;gCACJ,IAAIC;gCAEJ,MAAMC,YAAYC,KAAK,GAAG,CAACL;gCAC3B,MAAMM,YAAYD,KAAK,GAAG,CAACJ;gCAE3B,IAAIK,YAAYF,WAAW;oCAEzBD,WAAWE,KAAK,KAAK,CAClBC,YAAYlB,KAAK,MAAM,GAAInB;oCAE9BiC,YAAYD,SAAS,IAAI,OAAO;gCAClC,OAAO;oCAELE,WAAWE,KAAK,KAAK,CAClBD,YAAYhB,KAAK,KAAK,GAAInB;oCAE7BiC,YAAYF,SAAS,IAAI,SAAS;gCACpC;gCAEAjC,MACE,CAAC,2BAA2B,EAAEmC,UAAU,YAAY,EAAEC,UAAU;gCAGlE,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLP;4CAEAO;4CACAD;wCACF;wCACA,SAASH,YAAY,KAAK,IAAI;oCAChC;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMQ,kBAAkBjB;gCACxBvB,MAAM,gCAAgCwC;gCACtC,MAAM,CAACf,IAAIC,IAAIC,IAAIC,GAAG,GAAGzB,wBACvBqC,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BnB,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;wCACA,SAASW,gBAAgB,KAAK,IAAI;oCACpC;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAalB;gCACnBvB,MAAM,0BAA0ByC;gCAChC,OAAO;oCACL;wCACE,MAAM5B,eACJC,aACAH,mBACA;wCAEF,OAAO,CAAC;wCACR,SAAS8B,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAanB;gCACnBvB,MAAM,0BAA0B0C;gCAChC,OAAO;oCACL;wCACE,MAAM7B,eACJC,aACAF,mBACA;wCAEF,OAAO,CAAC;wCACR,SAAS8B,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAapB;gCACnBvB,MAAM,0BAA0B2C;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,QAAQA,WAAW,UAAU;wCAC/B;wCACA,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAU;gCACb,MAAMC,eAAerB;gCACrBvB,MAAM,4BAA4B4C;gCAClC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CAAE,KAAKA,aAAa,GAAG;wCAAC;wCAC/B,SAASA,aAAa,KAAK,IAAI;oCACjC;iCACD;4BACH;wBACA,KAAK;4BACH,MAAM,IAAIC,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ;4BACE,MAAM,IAAIA,MACR,CAAC,0BAA0B,EAAGtB,SAAiB,MAAM,EAAE;oBAE7D;gBACF;YACA;gBACE,MAAM,IAAIsB,MACR,CAAC,yBAAyB,EAAGzB,OAAe,SAAS,EAAE;QAE7D;IACF,EAAE,OAAO0B,OAAO;QACd,MAAMC,eAAeD,iBAAiBD,QAAQC,MAAM,OAAO,GAAGE,OAAOF;QACrE9C,MAAM,oBAAoB+C;QAC1B,MAAM,IAAIF,MAAM,CAAC,4BAA4B,EAAEE,cAAc;IAC/D;AACF"}
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import { getAutoGLMLocatePrompt, getAutoGLMPlanPrompt } from "./prompt.mjs";
|
|
2
|
-
import { parseAction, parseAutoGLMLocateResponse, parseAutoGLMResponse } from "./parser.mjs";
|
|
3
|
-
import { autoGLMPlanning } from "./planning.mjs";
|
|
4
|
-
import { transformAutoGLMAction } from "./actions.mjs";
|
|
5
|
-
import { isAutoGLM, isUITars } from "./util.mjs";
|
|
6
|
-
export { autoGLMPlanning, getAutoGLMLocatePrompt, getAutoGLMPlanPrompt, isAutoGLM, isUITars, parseAction, parseAutoGLMLocateResponse, parseAutoGLMResponse, transformAutoGLMAction };
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/auto-glm/parser.mjs","sources":["../../../../src/ai-model/auto-glm/parser.ts"],"sourcesContent":["import { getDebug } from '@midscene/shared/logger';\nimport type { ParsedAction } from './actions';\n\nconst debug = getDebug('auto-glm-parser');\n\n// Do not rely on regex extraction here; regex can fail on malformed input.\n// Bad Case: finish(message=\"Finished! Now There is a contact whose name is \\\"Tom\\\" in the list.\")\nexport const extractValueAfter = (src: string, key: string): string => {\n const idx = src.indexOf(key);\n if (idx === -1) {\n throw new Error(`Missing key ${key} in action payload ${src}`);\n }\n let rest = src.slice(idx + key.length).trim();\n if (rest.endsWith('\")')) {\n rest = rest.slice(0, -2);\n }\n return rest;\n};\n\nexport function parseAction(response: {\n think: string;\n content: string;\n}): ParsedAction {\n debug('Parsing action:', response);\n let trimmedResponse = '';\n try {\n trimmedResponse = response.content.trim();\n\n if (\n trimmedResponse.startsWith('do(action=\"Type\"') ||\n trimmedResponse.startsWith('do(action=\"Type_Name\"')\n ) {\n const text = extractValueAfter(trimmedResponse, 'text=\"');\n return {\n _metadata: 'do',\n action: 'Type',\n text,\n think: response.think,\n } as ParsedAction;\n }\n\n if (trimmedResponse.startsWith('finish(message=')) {\n let message = extractValueAfter(trimmedResponse, 'finish(message=\"');\n if (message.endsWith(')')) message = message.slice(0, -1);\n return {\n _metadata: 'finish',\n message,\n think: response.think,\n } as ParsedAction;\n }\n\n if (trimmedResponse.startsWith('do(')) {\n const actionMatch = trimmedResponse.match(/do\\(action=\"([^\"]+)\"/);\n if (!actionMatch)\n throw new Error(\n `Failed to extract action type from do() call; raw=\"${trimmedResponse}\"`,\n );\n const actionType = actionMatch[1];\n\n const baseAction = { _metadata: 'do' as const, think: response.think };\n switch (actionType) {\n case 'Tap': {\n const elementMatch = trimmedResponse.match(/element=\\[(\\d+),(\\d+)\\]/);\n if (!elementMatch)\n throw new Error(\n `Failed to extract element coordinates for Tap; raw=\"${trimmedResponse}\"`,\n );\n return {\n ...baseAction,\n action: 'Tap',\n element: [Number(elementMatch[1]), Number(elementMatch[2])],\n } as ParsedAction;\n }\n case 'Double Tap': {\n const elementMatch = trimmedResponse.match(/element=\\[(\\d+),(\\d+)\\]/);\n if (!elementMatch)\n throw new Error(\n `Failed to extract element coordinates for Double Tap; raw=\"${trimmedResponse}\"`,\n );\n return {\n ...baseAction,\n action: 'Double Tap',\n element: [Number(elementMatch[1]), Number(elementMatch[2])],\n } as ParsedAction;\n }\n case 'Swipe': {\n const startMatch = trimmedResponse.match(/start=\\[(\\d+),(\\d+)\\]/);\n const endMatch = trimmedResponse.match(/end=\\[(\\d+),(\\d+)\\]/);\n if (!startMatch || !endMatch)\n throw new Error(\n `Failed to extract start/end coordinates for Swipe; raw=\"${trimmedResponse}\"`,\n );\n return {\n ...baseAction,\n action: 'Swipe',\n start: [Number(startMatch[1]), Number(startMatch[2])],\n end: [Number(endMatch[1]), Number(endMatch[2])],\n } as ParsedAction;\n }\n case 'Long Press': {\n const elementMatch = trimmedResponse.match(/element=\\[(\\d+),(\\d+)\\]/);\n if (!elementMatch)\n throw new Error(\n `Failed to extract element coordinates for Long Press; raw=\"${trimmedResponse}\"`,\n );\n return {\n ...baseAction,\n action: 'Long Press',\n element: [Number(elementMatch[1]), Number(elementMatch[2])],\n } as ParsedAction;\n }\n case 'Launch': {\n const app = extractValueAfter(trimmedResponse, 'app=\"');\n return { ...baseAction, action: 'Launch', app } as ParsedAction;\n }\n case 'Back': {\n return { ...baseAction, action: 'Back' } as ParsedAction;\n }\n case 'Home': {\n return { ...baseAction, action: 'Home' } as ParsedAction;\n }\n case 'Wait': {\n const durationMatch = trimmedResponse.match(\n /duration=(?:[\"\\[])?(\\d+)/,\n );\n if (!durationMatch) {\n throw new Error(\n `Failed to extract duration for Wait; raw=\"${trimmedResponse}\"`,\n );\n }\n const seconds = Number.parseInt(durationMatch[1], 10);\n const durationMs = seconds * 1000;\n return {\n ...baseAction,\n action: 'Wait',\n durationMs,\n } as ParsedAction;\n }\n case 'Interact': {\n return { ...baseAction, action: 'Interact' } as ParsedAction;\n }\n case 'Call_API': {\n const instruction = extractValueAfter(\n trimmedResponse,\n 'instruction=\"',\n );\n return {\n ...baseAction,\n action: 'Call_API',\n instruction,\n } as ParsedAction;\n }\n case 'Take_over': {\n const message = extractValueAfter(trimmedResponse, 'message=\"');\n return {\n ...baseAction,\n action: 'Take_over',\n message,\n } as ParsedAction;\n }\n case 'Note': {\n const message = extractValueAfter(trimmedResponse, 'message=\"');\n return {\n ...baseAction,\n action: 'Note',\n message,\n } as ParsedAction;\n }\n default:\n throw new Error(\n `Unknown action type: ${actionType}; raw=\"${trimmedResponse}\"`,\n );\n }\n }\n throw new Error(`Failed to parse action: ${trimmedResponse}`);\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n throw new Error(\n `Failed to parse action: ${errorMessage}; raw=\"${trimmedResponse}\"`,\n );\n }\n}\n\nexport function parseAutoGLMResponse(content: string): {\n think: string;\n content: string;\n} {\n if (content.includes('finish(message=')) {\n const parts = content.split('finish(message=');\n const think = parts[0].trim();\n const actionContent = `finish(message=${parts[1]}`;\n return { think, content: actionContent };\n }\n if (content.includes('do(action=')) {\n const parts = content.split('do(action=');\n const think = parts[0].trim();\n const actionContent = `do(action=${parts[1]}`;\n return { think, content: actionContent };\n }\n if (content.includes('<answer>')) {\n const parts = content.split('<answer>');\n const think = parts[0]\n .replace(/<think>/g, '')\n .replace(/<\\/think>/g, '')\n .trim();\n const actionContent = parts[1].replace(/<\\/answer>/g, '').trim();\n return { think, content: actionContent };\n }\n return { think: '', content };\n}\n\nexport function parseAutoGLMLocateResponse(rawResponse: string): {\n think: string;\n coordinates: { x: number; y: number } | null;\n error?: string;\n} {\n const { think, content: actionContent } = parseAutoGLMResponse(rawResponse);\n if (!actionContent.startsWith('do(action=\"Tap\"')) {\n return {\n think,\n coordinates: null,\n error: `Unexpected action type in auto-glm locate response: ${actionContent}`,\n };\n }\n try {\n const elementMatch = actionContent.match(/element=\\[(\\d+),(\\d+)\\]/);\n if (!elementMatch) {\n return {\n think,\n coordinates: null,\n error: `Failed to extract element coordinates from auto-glm response: ${actionContent}`,\n };\n }\n const x = Number(elementMatch[1]);\n const y = Number(elementMatch[2]);\n return { think, coordinates: { x, y } };\n } catch (e) {\n const errorMessage = e instanceof Error ? e.message : String(e);\n return {\n think,\n coordinates: null,\n error: `Failed to parse coordinates \"${actionContent}\" with errorMessage: ${errorMessage}`,\n };\n }\n}\n"],"names":["debug","getDebug","extractValueAfter","src","key","idx","Error","rest","parseAction","response","trimmedResponse","text","message","actionMatch","actionType","baseAction","elementMatch","Number","startMatch","endMatch","app","durationMatch","seconds","durationMs","instruction","error","errorMessage","String","parseAutoGLMResponse","content","parts","think","actionContent","parseAutoGLMLocateResponse","rawResponse","x","y","e"],"mappings":";AAGA,MAAMA,QAAQC,SAAS;AAIhB,MAAMC,oBAAoB,CAACC,KAAaC;IAC7C,MAAMC,MAAMF,IAAI,OAAO,CAACC;IACxB,IAAIC,AAAQ,OAARA,KACF,MAAM,IAAIC,MAAM,CAAC,YAAY,EAAEF,IAAI,mBAAmB,EAAED,KAAK;IAE/D,IAAII,OAAOJ,IAAI,KAAK,CAACE,MAAMD,IAAI,MAAM,EAAE,IAAI;IAC3C,IAAIG,KAAK,QAAQ,CAAC,OAChBA,OAAOA,KAAK,KAAK,CAAC,GAAG;IAEvB,OAAOA;AACT;AAEO,SAASC,YAAYC,QAG3B;IACCT,MAAM,mBAAmBS;IACzB,IAAIC,kBAAkB;IACtB,IAAI;QACFA,kBAAkBD,SAAS,OAAO,CAAC,IAAI;QAEvC,IACEC,gBAAgB,UAAU,CAAC,uBAC3BA,gBAAgB,UAAU,CAAC,0BAC3B;YACA,MAAMC,OAAOT,kBAAkBQ,iBAAiB;YAChD,OAAO;gBACL,WAAW;gBACX,QAAQ;gBACRC;gBACA,OAAOF,SAAS,KAAK;YACvB;QACF;QAEA,IAAIC,gBAAgB,UAAU,CAAC,oBAAoB;YACjD,IAAIE,UAAUV,kBAAkBQ,iBAAiB;YACjD,IAAIE,QAAQ,QAAQ,CAAC,MAAMA,UAAUA,QAAQ,KAAK,CAAC,GAAG;YACtD,OAAO;gBACL,WAAW;gBACXA;gBACA,OAAOH,SAAS,KAAK;YACvB;QACF;QAEA,IAAIC,gBAAgB,UAAU,CAAC,QAAQ;YACrC,MAAMG,cAAcH,gBAAgB,KAAK,CAAC;YAC1C,IAAI,CAACG,aACH,MAAM,IAAIP,MACR,CAAC,mDAAmD,EAAEI,gBAAgB,CAAC,CAAC;YAE5E,MAAMI,aAAaD,WAAW,CAAC,EAAE;YAEjC,MAAME,aAAa;gBAAE,WAAW;gBAAe,OAAON,SAAS,KAAK;YAAC;YACrE,OAAQK;gBACN,KAAK;oBAAO;wBACV,MAAME,eAAeN,gBAAgB,KAAK,CAAC;wBAC3C,IAAI,CAACM,cACH,MAAM,IAAIV,MACR,CAAC,oDAAoD,EAAEI,gBAAgB,CAAC,CAAC;wBAE7E,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACR,SAAS;gCAACE,OAAOD,YAAY,CAAC,EAAE;gCAAGC,OAAOD,YAAY,CAAC,EAAE;6BAAE;wBAC7D;oBACF;gBACA,KAAK;oBAAc;wBACjB,MAAMA,eAAeN,gBAAgB,KAAK,CAAC;wBAC3C,IAAI,CAACM,cACH,MAAM,IAAIV,MACR,CAAC,2DAA2D,EAAEI,gBAAgB,CAAC,CAAC;wBAEpF,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACR,SAAS;gCAACE,OAAOD,YAAY,CAAC,EAAE;gCAAGC,OAAOD,YAAY,CAAC,EAAE;6BAAE;wBAC7D;oBACF;gBACA,KAAK;oBAAS;wBACZ,MAAME,aAAaR,gBAAgB,KAAK,CAAC;wBACzC,MAAMS,WAAWT,gBAAgB,KAAK,CAAC;wBACvC,IAAI,CAACQ,cAAc,CAACC,UAClB,MAAM,IAAIb,MACR,CAAC,wDAAwD,EAAEI,gBAAgB,CAAC,CAAC;wBAEjF,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACR,OAAO;gCAACE,OAAOC,UAAU,CAAC,EAAE;gCAAGD,OAAOC,UAAU,CAAC,EAAE;6BAAE;4BACrD,KAAK;gCAACD,OAAOE,QAAQ,CAAC,EAAE;gCAAGF,OAAOE,QAAQ,CAAC,EAAE;6BAAE;wBACjD;oBACF;gBACA,KAAK;oBAAc;wBACjB,MAAMH,eAAeN,gBAAgB,KAAK,CAAC;wBAC3C,IAAI,CAACM,cACH,MAAM,IAAIV,MACR,CAAC,2DAA2D,EAAEI,gBAAgB,CAAC,CAAC;wBAEpF,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACR,SAAS;gCAACE,OAAOD,YAAY,CAAC,EAAE;gCAAGC,OAAOD,YAAY,CAAC,EAAE;6BAAE;wBAC7D;oBACF;gBACA,KAAK;oBAAU;wBACb,MAAMI,MAAMlB,kBAAkBQ,iBAAiB;wBAC/C,OAAO;4BAAE,GAAGK,UAAU;4BAAE,QAAQ;4BAAUK;wBAAI;oBAChD;gBACA,KAAK;oBACH,OAAO;wBAAE,GAAGL,UAAU;wBAAE,QAAQ;oBAAO;gBAEzC,KAAK;oBACH,OAAO;wBAAE,GAAGA,UAAU;wBAAE,QAAQ;oBAAO;gBAEzC,KAAK;oBAAQ;wBACX,MAAMM,gBAAgBX,gBAAgB,KAAK,CACzC;wBAEF,IAAI,CAACW,eACH,MAAM,IAAIf,MACR,CAAC,0CAA0C,EAAEI,gBAAgB,CAAC,CAAC;wBAGnE,MAAMY,UAAUL,OAAO,QAAQ,CAACI,aAAa,CAAC,EAAE,EAAE;wBAClD,MAAME,aAAaD,AAAU,OAAVA;wBACnB,OAAO;4BACL,GAAGP,UAAU;4BACb,QAAQ;4BACRQ;wBACF;oBACF;gBACA,KAAK;oBACH,OAAO;wBAAE,GAAGR,UAAU;wBAAE,QAAQ;oBAAW;gBAE7C,KAAK;oBAAY;wBACf,MAAMS,cAActB,kBAClBQ,iBACA;wBAEF,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACRS;wBACF;oBACF;gBACA,KAAK;oBAAa;wBAChB,MAAMZ,UAAUV,kBAAkBQ,iBAAiB;wBACnD,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACRH;wBACF;oBACF;gBACA,KAAK;oBAAQ;wBACX,MAAMA,UAAUV,kBAAkBQ,iBAAiB;wBACnD,OAAO;4BACL,GAAGK,UAAU;4BACb,QAAQ;4BACRH;wBACF;oBACF;gBACA;oBACE,MAAM,IAAIN,MACR,CAAC,qBAAqB,EAAEQ,WAAW,OAAO,EAAEJ,gBAAgB,CAAC,CAAC;YAEpE;QACF;QACA,MAAM,IAAIJ,MAAM,CAAC,wBAAwB,EAAEI,iBAAiB;IAC9D,EAAE,OAAOe,OAAO;QACd,MAAMC,eAAeD,iBAAiBnB,QAAQmB,MAAM,OAAO,GAAGE,OAAOF;QACrE,MAAM,IAAInB,MACR,CAAC,wBAAwB,EAAEoB,aAAa,OAAO,EAAEhB,gBAAgB,CAAC,CAAC;IAEvE;AACF;AAEO,SAASkB,qBAAqBC,OAAe;IAIlD,IAAIA,QAAQ,QAAQ,CAAC,oBAAoB;QACvC,MAAMC,QAAQD,QAAQ,KAAK,CAAC;QAC5B,MAAME,QAAQD,KAAK,CAAC,EAAE,CAAC,IAAI;QAC3B,MAAME,gBAAgB,CAAC,eAAe,EAAEF,KAAK,CAAC,EAAE,EAAE;QAClD,OAAO;YAAEC;YAAO,SAASC;QAAc;IACzC;IACA,IAAIH,QAAQ,QAAQ,CAAC,eAAe;QAClC,MAAMC,QAAQD,QAAQ,KAAK,CAAC;QAC5B,MAAME,QAAQD,KAAK,CAAC,EAAE,CAAC,IAAI;QAC3B,MAAME,gBAAgB,CAAC,UAAU,EAAEF,KAAK,CAAC,EAAE,EAAE;QAC7C,OAAO;YAAEC;YAAO,SAASC;QAAc;IACzC;IACA,IAAIH,QAAQ,QAAQ,CAAC,aAAa;QAChC,MAAMC,QAAQD,QAAQ,KAAK,CAAC;QAC5B,MAAME,QAAQD,KAAK,CAAC,EAAE,CACnB,OAAO,CAAC,YAAY,IACpB,OAAO,CAAC,cAAc,IACtB,IAAI;QACP,MAAME,gBAAgBF,KAAK,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,IAAI,IAAI;QAC9D,OAAO;YAAEC;YAAO,SAASC;QAAc;IACzC;IACA,OAAO;QAAE,OAAO;QAAIH;IAAQ;AAC9B;AAEO,SAASI,2BAA2BC,WAAmB;IAK5D,MAAM,EAAEH,KAAK,EAAE,SAASC,aAAa,EAAE,GAAGJ,qBAAqBM;IAC/D,IAAI,CAACF,cAAc,UAAU,CAAC,oBAC5B,OAAO;QACLD;QACA,aAAa;QACb,OAAO,CAAC,oDAAoD,EAAEC,eAAe;IAC/E;IAEF,IAAI;QACF,MAAMhB,eAAegB,cAAc,KAAK,CAAC;QACzC,IAAI,CAAChB,cACH,OAAO;YACLe;YACA,aAAa;YACb,OAAO,CAAC,8DAA8D,EAAEC,eAAe;QACzF;QAEF,MAAMG,IAAIlB,OAAOD,YAAY,CAAC,EAAE;QAChC,MAAMoB,IAAInB,OAAOD,YAAY,CAAC,EAAE;QAChC,OAAO;YAAEe;YAAO,aAAa;gBAAEI;gBAAGC;YAAE;QAAE;IACxC,EAAE,OAAOC,GAAG;QACV,MAAMX,eAAeW,aAAa/B,QAAQ+B,EAAE,OAAO,GAAGV,OAAOU;QAC7D,OAAO;YACLN;YACA,aAAa;YACb,OAAO,CAAC,6BAA6B,EAAEC,cAAc,qBAAqB,EAAEN,cAAc;QAC5F;IACF;AACF"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/auto-glm/planning.mjs","sources":["../../../../src/ai-model/auto-glm/planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/device';\nimport type { PlanningAIResponse, UIContext } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { ConversationHistory } from '../conversation-history';\nimport {\n AIResponseParseError,\n callAIWithStringResponse,\n} from '../service-caller/index';\nimport { transformAutoGLMAction } from './actions';\nimport { parseAction, parseAutoGLMResponse } from './parser';\nimport { getAutoGLMPlanPrompt } from './prompt';\n\nconst debug = getDebug('auto-glm-planning');\n\nexport async function autoGLMPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n actionContext?: string;\n actionSpace?: DeviceAction[];\n abortSignal?: AbortSignal;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig, actionContext } = options;\n\n const systemPrompt =\n getAutoGLMPlanPrompt(modelConfig.modelFamily) +\n (actionContext\n ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>`\n : '');\n\n const imagePayloadBase64 = context.screenshot.base64;\n\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'text', text: userInstruction }],\n });\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'image_url', image_url: { url: imagePayloadBase64 } }],\n });\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...conversationHistory.snapshot(1),\n ];\n\n const { content: rawResponse, usage } = await callAIWithStringResponse(\n msgs,\n modelConfig,\n {\n abortSignal: options.abortSignal,\n },\n );\n\n debug('autoGLMPlanning rawResponse:', rawResponse);\n\n let parsedResponse: ReturnType<typeof parseAutoGLMResponse>;\n let transformedActions: ReturnType<typeof transformAutoGLMAction>;\n\n try {\n parsedResponse = parseAutoGLMResponse(rawResponse);\n debug('thinking in response:', parsedResponse.think);\n debug('action in response:', parsedResponse.content);\n\n const parsedAction = parseAction(parsedResponse);\n debug('Parsed action object:', parsedAction);\n transformedActions = transformAutoGLMAction(\n parsedAction,\n context.shotSize,\n options.actionSpace,\n );\n debug('Transformed actions:', transformedActions);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `Parse error: ${errorMessage}`,\n JSON.stringify(rawResponse, undefined, 2),\n usage,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: `<think>${parsedResponse.think}</think><answer>${parsedResponse.content}</answer>`,\n });\n\n const shouldContinuePlanning = !parsedResponse.content.startsWith('finish(');\n\n return {\n actions: transformedActions,\n log: rawResponse,\n usage,\n shouldContinuePlanning,\n rawResponse: JSON.stringify(rawResponse, undefined, 2),\n };\n}\n"],"names":["debug","getDebug","autoGLMPlanning","userInstruction","options","conversationHistory","context","modelConfig","actionContext","systemPrompt","getAutoGLMPlanPrompt","imagePayloadBase64","msgs","rawResponse","usage","callAIWithStringResponse","parsedResponse","transformedActions","parseAutoGLMResponse","parsedAction","parseAction","transformAutoGLMAction","parseError","errorMessage","Error","String","AIResponseParseError","JSON","undefined","shouldContinuePlanning"],"mappings":";;;;;AAcA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,gBACpBC,eAAuB,EACvBC,OAOC;IAED,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,WAAW,EAAEC,aAAa,EAAE,GAAGJ;IAErE,MAAMK,eACJC,qBAAqBH,YAAY,WAAW,IAC3CC,CAAAA,gBACG,CAAC,yBAAyB,EAAEA,cAAc,0BAA0B,CAAC,GACrE,EAAC;IAEP,MAAMG,qBAAqBL,QAAQ,UAAU,CAAC,MAAM;IAEpDD,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAQ,MAAMF;YAAgB;SAAE;IACpD;IACAE,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAa,WAAW;oBAAE,KAAKM;gBAAmB;YAAE;SAAE;IAC1E;IAEA,MAAMC,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASH;QAAa;WACrCJ,oBAAoB,QAAQ,CAAC;KACjC;IAED,MAAM,EAAE,SAASQ,WAAW,EAAEC,KAAK,EAAE,GAAG,MAAMC,yBAC5CH,MACAL,aACA;QACE,aAAaH,QAAQ,WAAW;IAClC;IAGFJ,MAAM,gCAAgCa;IAEtC,IAAIG;IACJ,IAAIC;IAEJ,IAAI;QACFD,iBAAiBE,qBAAqBL;QACtCb,MAAM,yBAAyBgB,eAAe,KAAK;QACnDhB,MAAM,uBAAuBgB,eAAe,OAAO;QAEnD,MAAMG,eAAeC,YAAYJ;QACjChB,MAAM,yBAAyBmB;QAC/BF,qBAAqBI,uBACnBF,cACAb,QAAQ,QAAQ,EAChBF,QAAQ,WAAW;QAErBJ,MAAM,wBAAwBiB;IAChC,EAAE,OAAOK,YAAY;QAEnB,MAAMC,eACJD,sBAAsBE,QAAQF,WAAW,OAAO,GAAGG,OAAOH;QAC5D,MAAM,IAAII,qBACR,CAAC,aAAa,EAAEH,cAAc,EAC9BI,KAAK,SAAS,CAACd,aAAae,QAAW,IACvCd;IAEJ;IAEAT,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS,CAAC,OAAO,EAAEW,eAAe,KAAK,CAAC,gBAAgB,EAAEA,eAAe,OAAO,CAAC,SAAS,CAAC;IAC7F;IAEA,MAAMa,yBAAyB,CAACb,eAAe,OAAO,CAAC,UAAU,CAAC;IAElE,OAAO;QACL,SAASC;QACT,KAAKJ;QACLC;QACAe;QACA,aAAaF,KAAK,SAAS,CAACd,aAAae,QAAW;IACtD;AACF"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/auto-glm/prompt.mjs","sources":["../../../../src/ai-model/auto-glm/prompt.ts"],"sourcesContent":["/**\n * Auto-GLM Prompt Templates\n *\n * Portions of this file are derived from Open-AutoGLM\n * Copyright (c) 2024 zai-org\n * Licensed under the Apache License, Version 2.0\n *\n * Source: https://github.com/zai-org/Open-AutoGLM\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n * Modifications:\n * - Adapted prompts for Midscene.js integration\n */\n\nimport type { TModelFamily } from '@midscene/shared/env';\n\n/**\n * Get formatted date string for system prompts\n * @returns Formatted date string like \"2026-01-12, Sunday\"\n */\nfunction getMultilingualFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const dayOfWeek = [\n 'Sunday',\n 'Monday',\n 'Tuesday',\n 'Wednesday',\n 'Thursday',\n 'Friday',\n 'Saturday',\n ][today.getDay()];\n\n return `${year}-${month}-${date}, ${dayOfWeek}`;\n}\n\n/**\n * Get formatted Chinese date (e.g., \"2026年01月13日 星期一\")\n */\nfunction getChineseFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const weekdayNames = [\n '星期日',\n '星期一',\n '星期二',\n '星期三',\n '星期四',\n '星期五',\n '星期六',\n ];\n const weekday = weekdayNames[today.getDay()];\n\n return `${year}年${month}月${date}日 ${weekday}`;\n}\n\nconst getAutoGLMMultilingualPlanPrompt = (): string => {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n- **Type**\n Enter text into the currently focused input field.\n **Example**:\n <answer>\n do(action=\"Type\", text=\"Hello World\")\n </answer>\n- **Swipe**\n Perform a swipe action with start point and end point.\n **Examples**:\n <answer>\n do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])\n </answer>\n- **Long Press**\n Perform a long press action on a specified screen area.\n You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.\n **Example**:\n <answer>\n do(action=\"Long Press\", element=[x,y])\n </answer>\n- **Launch**\n Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.\n **Example**:\n <answer>\n do(action=\"Launch\", app=\"Settings\")\n </answer>\n- **Back**\n Press the Back button to navigate to the previous screen.\n **Example**:\n <answer>\n do(action=\"Back\")\n </answer>\n- **Finish**\n Terminate the program and optionally print a message.\n **Example**:\n <answer>\n finish(message=\"Task completed.\")\n </answer>\n\n\nREMEMBER:\n- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in <think> part.\n- Only ONE LINE of action in <answer> part per response: Each step must contain exactly one line of executable code.\n- Generate execution code strictly according to format requirements.\n `;\n};\n\nconst getAutoGLMChinesePlanPrompt = (): string => {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Launch\", app=\"xxx\") \n Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y], message=\"重要操作\") \n 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。\n- do(action=\"Type\", text=\"xxx\") \n Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。\n- do(action=\"Type_Name\", text=\"xxx\") \n Type_Name是输入人名的操作,基本功能同Type。\n- do(action=\"Swipe\", start=[x1,y1], end=[x2,y2]) \n Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Long Press\", element=[x,y]) \n Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。\n- do(action=\"Double Tap\", element=[x,y]) \n Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Back\") \n 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Home\") \n Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Wait\", duration=\"x seconds\") \n 等待页面加载,x为需要等待多少秒。\n- finish(message=\"xxx\") \n finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 \n\n必须遵循的规则:\n0. 严禁调用 Interact、Take_over、Note、Call_API 这四个操作,这些操作暂不支持。\n1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。\n2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。\n3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。\n4. 如果页面显示网络问题,需要重新加载,请点击重新加载。\n5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。\n6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。\n7. 在做小红书总结类任务时一定要筛选图文笔记。\n8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。\n9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。\n10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。\n11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将\"群\"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。\n12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。\n13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。\n14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。\n15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。\n16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。\n17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message=\"原因\")。\n18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。\n`;\n};\n\nexport const getAutoGLMPlanPrompt = (\n modelFamily: TModelFamily | undefined,\n): string => {\n if (modelFamily === 'auto-glm-multilingual') {\n return getAutoGLMMultilingualPlanPrompt();\n } else if (modelFamily === 'auto-glm') {\n return getAutoGLMChinesePlanPrompt();\n }\n throw new Error(\n `Unsupported modelFamily for Auto-GLM plan prompt: ${modelFamily}`,\n );\n};\n\nexport const getAutoGLMLocatePrompt = (\n modelFamily: TModelFamily | undefined,\n): string => {\n if (modelFamily === 'auto-glm-multilingual') {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n\nREMEMBER:\n- Your goal is to locate and tap the UI element specified by the user (e.g., button, icon, link, etc.). Do not attempt any other actions.\n `;\n } else if (modelFamily === 'auto-glm') {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n\n必须遵循的规则:\n- 你的目标是定位并点击用户指定的UI元素(例如按钮、图标、链接等),请不要尝试任何其他的操作。\n `;\n }\n throw new Error(\n `Unsupported modelFamily for Auto-GLM locate prompt: ${modelFamily}`,\n );\n};\n"],"names":["getMultilingualFormattedDate","today","Date","year","month","String","date","dayOfWeek","getChineseFormattedDate","weekdayNames","weekday","getAutoGLMMultilingualPlanPrompt","getAutoGLMChinesePlanPrompt","getAutoGLMPlanPrompt","modelFamily","Error","getAutoGLMLocatePrompt"],"mappings":"AA+BA,SAASA;IACP,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMM,YAAY;QAChB;QACA;QACA;QACA;QACA;QACA;QACA;KACD,CAACN,MAAM,MAAM,GAAG;IAEjB,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEC,WAAW;AACjD;AAKA,SAASC;IACP,MAAMP,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMQ,eAAe;QACnB;QACA;QACA;QACA;QACA;QACA;QACA;KACD;IACD,MAAMC,UAAUD,YAAY,CAACR,MAAM,MAAM,GAAG;IAE5C,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEI,SAAS;AAC/C;AAEA,MAAMC,mCAAmC,IAChC,CAAC;kBACQ,EAAEX,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAoEjD,CAAC;AAGH,MAAMY,8BAA8B,IAC3B,CAAC;QACF,EAAEJ,0BAA0B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAyDpC,CAAC;AAGM,MAAMK,uBAAuB,CAClCC;IAEA,IAAIA,AAAgB,4BAAhBA,aACF,OAAOH;IACF,IAAIG,AAAgB,eAAhBA,aACT,OAAOF;IAET,MAAM,IAAIG,MACR,CAAC,kDAAkD,EAAED,aAAa;AAEtE;AAEO,MAAME,yBAAyB,CACpCF;IAEA,IAAIA,AAAgB,4BAAhBA,aACF,OAAO,CAAC;kBACM,EAAEd,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA4BjD,CAAC;IACM,IAAIc,AAAgB,eAAhBA,aACT,OAAO,CAAC;QACJ,EAAEN,0BAA0B;;;;;;;;;;;;;;;;;IAiBhC,CAAC;IAEH,MAAM,IAAIO,MACR,CAAC,oDAAoD,EAAED,aAAa;AAExE"}
|