@midscene/core 1.8.10 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +40 -50
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/task-builder.mjs +39 -19
- package/dist/es/agent/task-builder.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +24 -22
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +11 -14
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/connectivity.mjs +7 -3
- package/dist/es/ai-model/connectivity.mjs.map +1 -1
- package/dist/es/ai-model/errors.mjs +9 -0
- package/dist/es/ai-model/errors.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +3 -4
- package/dist/es/ai-model/inspect.mjs +132 -144
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +46 -28
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/actions.mjs +22 -44
- package/dist/es/ai-model/models/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs +45 -0
- package/dist/es/ai-model/models/auto-glm/adapter.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs +112 -0
- package/dist/es/ai-model/models/auto-glm/locate.mjs.map +1 -0
- package/dist/es/ai-model/models/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/planning.mjs +6 -7
- package/dist/es/ai-model/models/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/{auto-glm → models/auto-glm}/prompt.mjs +3 -11
- package/dist/es/ai-model/models/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/models/default.mjs +12 -0
- package/dist/es/ai-model/models/default.mjs.map +1 -0
- package/dist/es/ai-model/models/doubao.mjs +138 -0
- package/dist/es/ai-model/models/doubao.mjs.map +1 -0
- package/dist/es/ai-model/models/gemini.mjs +34 -0
- package/dist/es/ai-model/models/gemini.mjs.map +1 -0
- package/dist/es/ai-model/models/glm.mjs +37 -0
- package/dist/es/ai-model/models/glm.mjs.map +1 -0
- package/dist/es/ai-model/models/gpt.mjs +31 -0
- package/dist/es/ai-model/models/gpt.mjs.map +1 -0
- package/dist/es/ai-model/models/index.mjs +2 -0
- package/dist/es/ai-model/models/qwen.mjs +113 -0
- package/dist/es/ai-model/models/qwen.mjs.map +1 -0
- package/dist/es/ai-model/models/registry.mjs +45 -0
- package/dist/es/ai-model/models/registry.mjs.map +1 -0
- package/dist/es/ai-model/models/resolved.mjs +104 -0
- package/dist/es/ai-model/models/resolved.mjs.map +1 -0
- package/dist/es/ai-model/models/types.mjs +0 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs +142 -0
- package/dist/es/ai-model/models/ui-tars/adapter.mjs.map +1 -0
- package/dist/es/ai-model/{ui-tars-planning.mjs → models/ui-tars/planning.mjs} +44 -62
- package/dist/es/ai-model/models/ui-tars/planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +3 -3
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +11 -11
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +25 -60
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -10
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs +9 -0
- package/dist/es/ai-model/prompt/locate-grounding-rules.mjs.map +1 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs +15 -0
- package/dist/es/ai-model/prompt/locate-param-example.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +5 -5
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs +107 -0
- package/dist/es/ai-model/prompts/locate-result-coordinates.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +59 -190
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/json.mjs +60 -0
- package/dist/es/ai-model/service-caller/json.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs +68 -0
- package/dist/es/ai-model/shared/model-locate-result/bbox.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs +96 -0
- package/dist/es/ai-model/shared/model-locate-result/factory.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/index.mjs +3 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs +41 -0
- package/dist/es/ai-model/shared/model-locate-result/parse.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs +64 -0
- package/dist/es/ai-model/shared/model-locate-result/pixel-bbox-mapper.mjs.map +1 -0
- package/dist/es/ai-model/shared/model-locate-result/types.mjs +0 -0
- package/dist/es/ai-model/types.mjs +0 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs +27 -0
- package/dist/es/ai-model/workflows/image-preprocess.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/index.mjs +2 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs +23 -0
- package/dist/es/ai-model/workflows/inspect/locate-result-rect.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs +18 -0
- package/dist/es/ai-model/workflows/inspect/search-area-mapping.mjs.map +1 -0
- package/dist/es/ai-model/workflows/inspect/types.mjs +0 -0
- package/dist/es/ai-model/workflows/planning/index.mjs +5 -0
- package/dist/es/ai-model/workflows/planning/index.mjs.map +1 -0
- package/dist/es/ai-model/workflows/planning/types.mjs +0 -0
- package/dist/es/common.mjs +2 -174
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/service/index.mjs +96 -69
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +4 -3
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +43 -53
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/task-builder.js +38 -18
- package/dist/lib/agent/task-builder.js.map +1 -1
- package/dist/lib/agent/tasks.js +23 -21
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +17 -17
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/connectivity.js +7 -3
- package/dist/lib/ai-model/connectivity.js.map +1 -1
- package/dist/lib/ai-model/errors.js +46 -0
- package/dist/lib/ai-model/errors.js.map +1 -0
- package/dist/lib/ai-model/index.js +7 -14
- package/dist/lib/ai-model/inspect.js +141 -144
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +44 -26
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/actions.js +22 -44
- package/dist/lib/ai-model/models/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js +79 -0
- package/dist/lib/ai-model/models/auto-glm/adapter.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js +146 -0
- package/dist/lib/ai-model/models/auto-glm/locate.js.map +1 -0
- package/dist/lib/ai-model/models/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/planning.js +8 -9
- package/dist/lib/ai-model/models/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm → models/auto-glm}/prompt.js +14 -16
- package/dist/lib/ai-model/models/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/{auto-glm/util.js → models/default.js} +13 -13
- package/dist/lib/ai-model/models/default.js.map +1 -0
- package/dist/lib/ai-model/models/doubao.js +184 -0
- package/dist/lib/ai-model/models/doubao.js.map +1 -0
- package/dist/lib/ai-model/models/gemini.js +68 -0
- package/dist/lib/ai-model/models/gemini.js.map +1 -0
- package/dist/lib/ai-model/models/glm.js +71 -0
- package/dist/lib/ai-model/models/glm.js.map +1 -0
- package/dist/lib/ai-model/models/gpt.js +65 -0
- package/dist/lib/ai-model/models/gpt.js.map +1 -0
- package/dist/lib/ai-model/{service-caller/image-detail.js → models/index.js} +8 -7
- package/dist/lib/ai-model/models/index.js.map +1 -0
- package/dist/lib/ai-model/models/qwen.js +147 -0
- package/dist/lib/ai-model/models/qwen.js.map +1 -0
- package/dist/lib/ai-model/models/registry.js +85 -0
- package/dist/lib/ai-model/models/registry.js.map +1 -0
- package/dist/lib/ai-model/models/resolved.js +138 -0
- package/dist/lib/ai-model/models/resolved.js.map +1 -0
- package/dist/lib/ai-model/models/types.js +20 -0
- package/dist/lib/ai-model/models/types.js.map +1 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js +176 -0
- package/dist/lib/ai-model/models/ui-tars/adapter.js.map +1 -0
- package/dist/lib/ai-model/{ui-tars-planning.js → models/ui-tars/planning.js} +44 -62
- package/dist/lib/ai-model/models/ui-tars/planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +3 -3
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +11 -11
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +25 -60
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +15 -10
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js +43 -0
- package/dist/lib/ai-model/prompt/locate-grounding-rules.js.map +1 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js +52 -0
- package/dist/lib/ai-model/prompt/locate-param-example.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +5 -5
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +5 -5
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js +150 -0
- package/dist/lib/ai-model/prompts/locate-result-coordinates.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +68 -199
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/service-caller/json.js +100 -0
- package/dist/lib/ai-model/service-caller/json.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js +117 -0
- package/dist/lib/ai-model/shared/model-locate-result/bbox.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js +130 -0
- package/dist/lib/ai-model/shared/model-locate-result/factory.js.map +1 -0
- package/dist/lib/ai-model/{prompt/common.js → shared/model-locate-result/index.js} +9 -9
- package/dist/lib/ai-model/shared/model-locate-result/index.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js +78 -0
- package/dist/lib/ai-model/shared/model-locate-result/parse.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js +98 -0
- package/dist/lib/ai-model/shared/model-locate-result/pixel-bbox-mapper.js.map +1 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js +20 -0
- package/dist/lib/ai-model/shared/model-locate-result/types.js.map +1 -0
- package/dist/lib/ai-model/types.js +20 -0
- package/dist/lib/ai-model/types.js.map +1 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js +61 -0
- package/dist/lib/ai-model/workflows/image-preprocess.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/index.js +50 -0
- package/dist/lib/ai-model/workflows/inspect/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js +60 -0
- package/dist/lib/ai-model/workflows/inspect/locate-result-rect.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js +52 -0
- package/dist/lib/ai-model/workflows/inspect/search-area-mapping.js.map +1 -0
- package/dist/lib/ai-model/workflows/inspect/types.js +20 -0
- package/dist/lib/ai-model/workflows/inspect/types.js.map +1 -0
- package/dist/lib/ai-model/{model-family.js → workflows/planning/index.js} +6 -7
- package/dist/lib/ai-model/workflows/planning/index.js.map +1 -0
- package/dist/lib/ai-model/workflows/planning/types.js +20 -0
- package/dist/lib/ai-model/workflows/planning/types.js.map +1 -0
- package/dist/lib/common.js +4 -206
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/service/index.js +96 -69
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +4 -3
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +14 -6
- package/dist/types/agent/task-builder.d.ts +2 -2
- package/dist/types/agent/tasks.d.ts +6 -6
- package/dist/types/agent/utils.d.ts +8 -5
- package/dist/types/ai-model/errors.d.ts +2 -0
- package/dist/types/ai-model/index.d.ts +2 -4
- package/dist/types/ai-model/inspect.d.ts +13 -33
- package/dist/types/ai-model/llm-planning.d.ts +6 -17
- package/dist/types/ai-model/{auto-glm → models/auto-glm}/actions.d.ts +2 -2
- package/dist/types/ai-model/models/auto-glm/adapter.d.ts +5 -0
- package/dist/types/ai-model/models/auto-glm/locate.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/planning.d.ts +3 -0
- package/dist/types/ai-model/models/auto-glm/prompt.d.ts +4 -0
- package/dist/types/ai-model/models/default.d.ts +2 -0
- package/dist/types/ai-model/models/doubao.d.ts +10 -0
- package/dist/types/ai-model/models/gemini.d.ts +18 -0
- package/dist/types/ai-model/models/glm.d.ts +18 -0
- package/dist/types/ai-model/models/gpt.d.ts +18 -0
- package/dist/types/ai-model/models/index.d.ts +2 -0
- package/dist/types/ai-model/models/qwen.d.ts +30 -0
- package/dist/types/ai-model/models/registry.d.ts +81 -0
- package/dist/types/ai-model/models/resolved.d.ts +9 -0
- package/dist/types/ai-model/models/types.d.ts +102 -0
- package/dist/types/ai-model/models/ui-tars/adapter.d.ts +6 -0
- package/dist/types/ai-model/{ui-tars-planning.d.ts → models/ui-tars/planning.d.ts} +7 -11
- package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +5 -5
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/locate-grounding-rules.d.ts +1 -0
- package/dist/types/ai-model/prompt/locate-param-example.d.ts +3 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +3 -3
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +3 -3
- package/dist/types/ai-model/prompts/locate-result-coordinates.d.ts +6 -0
- package/dist/types/ai-model/service-caller/index.d.ts +19 -27
- package/dist/types/ai-model/service-caller/json.d.ts +9 -0
- package/dist/types/ai-model/shared/model-locate-result/bbox.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/factory.d.ts +2 -0
- package/dist/types/ai-model/shared/model-locate-result/index.d.ts +3 -0
- package/dist/types/ai-model/shared/model-locate-result/parse.d.ts +5 -0
- package/dist/types/ai-model/shared/model-locate-result/pixel-bbox-mapper.d.ts +7 -0
- package/dist/types/ai-model/shared/model-locate-result/types.d.ts +157 -0
- package/dist/types/ai-model/types.d.ts +2 -0
- package/dist/types/ai-model/workflows/image-preprocess.d.ts +30 -0
- package/dist/types/ai-model/workflows/inspect/index.d.ts +1 -0
- package/dist/types/ai-model/workflows/inspect/locate-result-rect.d.ts +4 -0
- package/dist/types/ai-model/workflows/inspect/search-area-mapping.d.ts +3 -0
- package/dist/types/ai-model/workflows/inspect/types.d.ts +37 -0
- package/dist/types/ai-model/workflows/planning/index.d.ts +2 -0
- package/dist/types/ai-model/workflows/planning/types.d.ts +15 -0
- package/dist/types/common.d.ts +0 -30
- package/dist/types/device/index.d.ts +22 -22
- package/dist/types/service/index.d.ts +5 -4
- package/dist/types/types.d.ts +21 -9
- package/dist/types/yaml.d.ts +8 -2
- package/package.json +2 -2
- package/dist/es/ai-model/auto-glm/actions.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/index.mjs +0 -6
- package/dist/es/ai-model/auto-glm/parser.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/planning.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +0 -1
- package/dist/es/ai-model/auto-glm/util.mjs +0 -9
- package/dist/es/ai-model/auto-glm/util.mjs.map +0 -1
- package/dist/es/ai-model/model-family.mjs +0 -6
- package/dist/es/ai-model/model-family.mjs.map +0 -1
- package/dist/es/ai-model/prompt/common.mjs +0 -8
- package/dist/es/ai-model/prompt/common.mjs.map +0 -1
- package/dist/es/ai-model/service-caller/image-detail.mjs +0 -6
- package/dist/es/ai-model/service-caller/image-detail.mjs.map +0 -1
- package/dist/es/ai-model/ui-tars-planning.mjs.map +0 -1
- package/dist/lib/ai-model/auto-glm/actions.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/index.js +0 -66
- package/dist/lib/ai-model/auto-glm/index.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/parser.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/planning.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/prompt.js.map +0 -1
- package/dist/lib/ai-model/auto-glm/util.js.map +0 -1
- package/dist/lib/ai-model/model-family.js.map +0 -1
- package/dist/lib/ai-model/prompt/common.js.map +0 -1
- package/dist/lib/ai-model/service-caller/image-detail.js.map +0 -1
- package/dist/lib/ai-model/ui-tars-planning.js.map +0 -1
- package/dist/types/ai-model/auto-glm/index.d.ts +0 -6
- package/dist/types/ai-model/auto-glm/planning.d.ts +0 -12
- package/dist/types/ai-model/auto-glm/prompt.d.ts +0 -27
- package/dist/types/ai-model/auto-glm/util.d.ts +0 -13
- package/dist/types/ai-model/model-family.d.ts +0 -7
- package/dist/types/ai-model/prompt/common.d.ts +0 -2
- package/dist/types/ai-model/service-caller/image-detail.d.ts +0 -2
- /package/dist/es/ai-model/{auto-glm → models/auto-glm}/parser.mjs +0 -0
- /package/dist/lib/ai-model/{auto-glm → models/auto-glm}/parser.js +0 -0
- /package/dist/types/ai-model/{auto-glm → models/auto-glm}/parser.d.ts +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n generateElementByPoint,\n generateElementByRect,\n} from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n scaleImage,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nexport async function buildSearchAreaConfig(options: {\n context: UIContext;\n baseRect: Rect;\n modelFamily: IModelConfig['modelFamily'];\n}): Promise<{ rect: Rect; imageBase64: string; scale: number }> {\n const { context, baseRect, modelFamily } = options;\n const scaleRatio = 2;\n const sectionRect = expandSearchArea(baseRect, context.shotSize);\n\n const croppedResult = await cropByRect(\n context.screenshot.base64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n\n const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);\n sectionRect.width = scaledResult.width;\n sectionRect.height = scaledResult.height;\n return {\n rect: sectionRect,\n imageBase64: scaledResult.imageBase64,\n scale: scaleRatio,\n };\n}\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images. These reference images are supporting context only, not the current screenshot being evaluated, unless the task explicitly asks for comparison or matching.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}'. It is a reference image, not the current screenshot:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.shotSize.width;\n let imageHeight = context.shotSize.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? `Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig, {\n abortSignal: options.abortSignal,\n });\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Apply offset if searching in a cropped area\n let finalX = pixelX;\n let finalY = pixelY;\n if (options.searchConfig?.rect) {\n finalX += options.searchConfig.rect.left;\n finalY += options.searchConfig.rect.top;\n }\n\n const element: LocateResultElement = generateElementByPoint(\n [finalX, finalY],\n targetElementDescriptionText as string,\n );\n\n resRect = element.rect;\n debugInspect('auto-glm resRect:', resRect);\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<\n ReturnType<\n typeof callAIWithObjectResponse<AIElementResponse | [number, number]>\n >\n >;\n try {\n res = await callAIWithObjectResponse<AIElementResponse | [number, number]>(\n msgs,\n modelConfig,\n {\n abortSignal: options.abortSignal,\n },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n options.searchConfig?.scale,\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n scale?: number;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n {\n abortSignal: options.abortSignal,\n },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let searchAreaConfig:\n | Awaited<ReturnType<typeof buildSearchAreaConfig>>\n | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n const expandedRect = expandSearchArea(mergedRect, context.shotSize);\n const originalWidth = expandedRect.width;\n const originalHeight = expandedRect.height;\n debugSection('expanded sectionRect %j', expandedRect);\n\n searchAreaConfig = await buildSearchAreaConfig({\n context,\n baseRect: mergedRect,\n modelFamily,\n });\n\n debugSection(\n 'scaled sectionRect from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n searchAreaConfig.rect.width,\n searchAreaConfig.rect.height,\n searchAreaConfig.scale,\n );\n }\n\n return {\n rect: searchAreaConfig?.rect,\n imageBase64: searchAreaConfig?.imageBase64,\n scale: searchAreaConfig?.scale,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract({\n screenshotIncluded: extractOption?.screenshotIncluded !== false,\n referenceImagesIncluded: !!multimodalPrompt?.images?.length,\n });\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'text',\n text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',\n });\n\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n debugInspect('AiJudgeOrderSensitive: description=%s', description);\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","buildSearchAreaConfig","options","context","baseRect","modelFamily","scaleRatio","sectionRect","expandSearchArea","croppedResult","cropByRect","scaledResult","scaleImage","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","targetElementDescription","modelConfig","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","finalX","finalY","element","generateElementByPoint","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","searchAreaConfig","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandedRect","originalWidth","originalHeight","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","callAIFn","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AA6DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAEvB,eAAeE,sBAAsBC,OAI3C;IACC,MAAM,EAAEC,OAAO,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGH;IAC3C,MAAMI,aAAa;IACnB,MAAMC,cAAcC,iBAAiBJ,UAAUD,QAAQ,QAAQ;IAE/D,MAAMM,gBAAgB,MAAMC,WAC1BP,QAAQ,UAAU,CAAC,MAAM,EACzBI,aACAF,AAAgB,iBAAhBA;IAGF,MAAMM,eAAe,MAAMC,WAAWH,cAAc,WAAW,EAAEH;IACjEC,YAAY,KAAK,GAAGI,aAAa,KAAK;IACtCJ,YAAY,MAAM,GAAGI,aAAa,MAAM;IACxC,OAAO;QACL,MAAMJ;QACN,aAAaI,aAAa,WAAW;QACrC,OAAOL;IACT;AACF;AAEA,MAAMO,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,uDAAuD,CAAC;oBAChH;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBnB,OAMrC;IAUC,MAAM,EAAEC,OAAO,EAAEmB,wBAAwB,EAAEC,WAAW,EAAE,GAAGrB;IAC3D,MAAM,EAAEG,WAAW,EAAE,GAAGkB;IACxB,MAAMC,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElDsB,OACEH,0BACA;IAEF,MAAMI,+BAA+Bb,wBACnCS;IAEF,MAAMK,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,UAAUzB,eAC3B0B,uBAAuB1B,eACvB2B,4BAA4B3B;IAEhC,IAAI4B,eAAeT;IACnB,IAAIU,aAAa/B,QAAQ,QAAQ,CAAC,KAAK;IACvC,IAAIgC,cAAchC,QAAQ,QAAQ,CAAC,MAAM;IACzC,IAAIiC,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjC,QAAQ,YAAY,EAAE;QACxBuB,OACEvB,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFuB,OACEvB,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGF+B,eAAe/B,QAAQ,YAAY,CAAC,WAAW;QAC/CgC,aAAahC,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiC,cAAcjC,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkC,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAI9B,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMiC,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMrB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUzB,eACZ,CAAC,KAAK,EAAEsB,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOL,0BAAuC;QAChD,MAAMkB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQO,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAL,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAIV,UAAUzB,cAAc;QAC1B,MAAM,EAAE,SAASoC,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB1B,MAAMM,aAAa;YAChD,aAAarB,QAAQ,WAAW;QAClC;QAEFJ,aAAa,yBAAyB2C;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1C3C,aAAa,sBAAsB8C,OAAO,KAAK;QAC/C9C,aAAa,yBAAyB8C,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9D9C,aAAa,yBAAyBkD,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnC9C,aAAa,iCAAiC;gBAAEmD;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9CrC,aAAa,+BAA+B;gBAAEqD;gBAAQE;YAAO;YAG7D,IAAIC,SAASH;YACb,IAAII,SAASF;YACb,IAAInD,QAAQ,YAAY,EAAE,MAAM;gBAC9BoD,UAAUpD,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBACxCqD,UAAUrD,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YACzC;YAEA,MAAMsD,UAA+BC,uBACnC;gBAACH;gBAAQC;aAAO,EAChB7B;YAGFoB,UAAUU,QAAQ,IAAI;YACtB1D,aAAa,qBAAqBgD;YAElC,IAAIU,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMV;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIc;IAKJ,IAAI;QACFA,MAAM,MAAMC,yBACV1C,MACAM,aACA;YACE,aAAarB,QAAQ,WAAW;QAClC;IAEJ,EAAE,OAAO0D,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACAtB;YACA,mBAAmBwB;QACrB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIZ;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYU,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBU,MAAM,OAAO,CAACV,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAZ,UAAUuB,gBACRX,IAAI,OAAO,CAAC,IAAI,EAChBxB,YACAC,aACAjC,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkC,oBACAC,qBACAhC,aACAH,QAAQ,YAAY,EAAE;YAGxBJ,aAAa,WAAWgD;YAExB,MAAMU,UAA+Bc,sBACnCxB,SACApB;YAEFsB,SAAS,EAAE;YAEX,IAAIQ,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;IACF,EAAE,OAAOe,GAAG;QACV,MAAMC,MACJD,aAAaT,QACT,CAAC,sBAAsB,EAAES,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACvB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEwB,IAAI,CAAC,CAAC;aAFtBxB,SAAS;YAACwB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAM1B;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAgB;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAee,gBAAgBvE,OAKrC;IAQC,MAAM,EAAEC,OAAO,EAAEuE,kBAAkB,EAAEnD,WAAW,EAAE,GAAGrB;IACrD,MAAM,EAAEG,WAAW,EAAE,GAAGkB;IACxB,MAAMC,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM0B,eAAe8C,4BAA4BtE;IACjD,MAAMuE,gCAAgCC,0BACpChE,wBAAwB6D;IAE1B,MAAMzD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMoD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMlC,SAAS,MAAMzB,mBAAmB;YACtC,QAAQ2D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAzD,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAIsC;IAGJ,IAAI;QACFA,SAAS,MAAMnB,yBACb1C,MACAM,aACA;YACE,aAAarB,QAAQ,WAAW;QAClC;IAEJ,EAAE,OAAO0D,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACAtB;QACF;IACF;IAEA,IAAIqC;IAGJ,MAAMC,cAAcF,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIE,aAAa;QACf,MAAMC,aAAaZ,gBACjBW,aACA7E,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBE;QAEFL,aAAa,0BAA0BiF;QAEvC,MAAMC,oBAAoBJ,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D9E,aAAa,wBAAwBkF;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAShB,MAAM,OAAO,CAACgB,OAC/B,GAAG,CAAC,CAACA,OACGf,gBACLe,MACAjF,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBE;QAGNL,aAAa,qBAAqBmF;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DnF,aAAa,iBAAiBqF;QAE9B,MAAME,eAAe/E,iBAAiB6E,YAAYlF,QAAQ,QAAQ;QAClE,MAAMqF,gBAAgBD,aAAa,KAAK;QACxC,MAAME,iBAAiBF,aAAa,MAAM;QAC1CvF,aAAa,2BAA2BuF;QAExCR,mBAAmB,MAAM9E,sBAAsB;YAC7CE;YACA,UAAUkF;YACVhF;QACF;QAEAL,aACE,qDACAwF,eACAC,gBACAV,iBAAiB,IAAI,CAAC,KAAK,EAC3BA,iBAAiB,IAAI,CAAC,MAAM,EAC5BA,iBAAiB,KAAK;IAE1B;IAEA,OAAO;QACL,MAAMA,kBAAkB;QACxB,aAAaA,kBAAkB;QAC/B,OAAOA,kBAAkB;QACzB,OAAOD,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeY,qBAAwBxF,OAO7C;IACC,MAAM,EAAEyF,SAAS,EAAExF,OAAO,EAAEyF,aAAa,EAAE5E,gBAAgB,EAAEO,WAAW,EAAE,GACxErB;IACF,MAAM2B,eAAegE,sBAAsB;QACzC,oBAAoBD,eAAe,uBAAuB;QAC1D,yBAAyB,CAAC,CAAC5E,kBAAkB,QAAQ;IACvD;IACA,MAAMQ,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM2F,wBAAwBC,uBAC5B7F,QAAQ,eAAe,IAAI,IAC3ByF;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OAAO;QAC/CI,YAAY,IAAI,CAAC;YACf,MAAM;YACN,MAAM;QACR;QAEAA,YAAY,IAAI,CAAC;YACf,MAAM;YACN,WAAW;gBACT,KAAKxE;gBACL,QAAQ;YACV;QACF;IACF;IAEAwE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM7E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAASmE;QACX;KACD;IAED,IAAIhF,kBAAkB;QACpB,MAAMwB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM,EACJ,SAASwB,WAAW,EACpBtB,KAAK,EACLuD,iBAAiB,EAClB,GAAG,MAAMC,OAAOjF,MAAMM;IAGvB,IAAI4E;IACJ,IAAI;QACFA,cAAcC,2BAA8BpC;IAC9C,EAAE,OAAOqC,YAAY;QAEnB,MAAMxC,eACJwC,sBAAsBvC,QAAQuC,WAAW,OAAO,GAAGtC,OAAOsC;QAC5D,MAAM,IAAIpC,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAtB;IAEJ;IAEA,OAAO;QACLyD;QACAnC;QACAtB;QACAuD;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBC,QAAwE,EACxEjF,WAAyB;IAKzB,MAAMM,eAAe4E;IACrB,MAAMC,aAAaC,0BAA0BJ;IAE7C,MAAMtF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS6E;QACX;KACD;IAED5G,aAAa,yCAAyCyG;IAEtD,MAAMzB,SAAS,MAAM0B,SAASvF,MAAMM;IAEpC,OAAO;QACL,kBAAkBuD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocateResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport {\n cropByRect,\n preProcessImageUrl,\n scaleImage,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { expandSearchArea } from '../common';\nimport type { ModelRuntime } from './models';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n} from './service-caller/index';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport {\n mergePixelBboxesToRect,\n pixelBboxToRect,\n} from './workflows/inspect/locate-result-rect';\nimport { mapSearchAreaPixelBboxToOriginalPixelBbox } from './workflows/inspect/search-area-mapping';\nimport type {\n LocateOptions,\n LocateResult,\n SearchAreaConfig,\n} from './workflows/inspect/types';\n\nexport type InspectAIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nfunction hasLocateResult(input: unknown, resultKey: string) {\n if (!input || typeof input !== 'object') {\n return false;\n }\n\n const record = input as Record<string, unknown>;\n const locateResult = record[resultKey];\n return Array.isArray(locateResult)\n ? locateResult.length > 0\n : locateResult !== undefined;\n}\n\nexport async function buildSearchAreaConfig(options: {\n context: UIContext;\n baseRect: Rect;\n}): Promise<SearchAreaConfig> {\n const { context, baseRect } = options;\n const scaleRatio = 2;\n const sectionRect = expandSearchArea(baseRect, context.shotSize);\n\n const croppedResult = await cropByRect(\n context.screenshot.base64,\n sectionRect,\n );\n\n const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);\n return {\n sourceRect: sectionRect,\n image: {\n imageBase64: scaledResult.imageBase64,\n width: scaledResult.width,\n height: scaledResult.height,\n },\n mapping: {\n offset: {\n x: sectionRect.left,\n y: sectionRect.top,\n },\n scale: scaleRatio,\n },\n };\n}\n\nexport const extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n }\n return prompt.prompt;\n};\n\nexport const promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images. These reference images are supporting context only, not the current screenshot being evaluated, unless the task explicitly asks for comparison or matching.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}'. It is a reference image, not the current screenshot:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(\n options: LocateOptions & { targetElementDescription: TUserPrompt },\n): Promise<LocateResult> {\n const { targetElementDescription, ...locateOptions } = options;\n const locateAdapter = options.modelRuntime.adapter.locate;\n if (locateAdapter.kind === 'custom') {\n return locateAdapter.locateFn(targetElementDescription, locateOptions);\n }\n return genericLocate(targetElementDescription, locateOptions);\n}\n\nexport async function genericLocate(\n elementDescription: TUserPrompt,\n options: LocateOptions,\n): Promise<LocateResult> {\n const { context } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'generic locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n\n assert(elementDescription, 'cannot find the target element description');\n const elementDescriptionText = extraTextFromUserPrompt(elementDescription);\n const userInstructionPrompt = findElementPrompt(elementDescriptionText);\n const systemPrompt = systemPromptToLocateElement(\n adapter.locate.resultAdapter.promptSpec,\n );\n\n const modelImage = options.searchConfig?.image ?? {\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n };\n const preparedImage = await prepareModelImage({\n imageBase64: modelImage.imageBase64,\n width: modelImage.width,\n height: modelImage.height,\n policy: adapter.imagePreprocess,\n });\n\n const imagePayload = preparedImage.imageBase64;\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof elementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: elementDescription.images,\n convertHttpImage2Base64: elementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let res: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AIElementLocateResponse>>\n >;\n try {\n res = await callAIWithObjectResponse<AIElementLocateResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'locate',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElement: LocateResultElement | undefined;\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(res.content, resultAdapter.promptSpec.resultKey)) {\n return {\n rect: undefined,\n parseResult: {\n element: undefined,\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n }\n\n try {\n const mapping = options.searchConfig?.mapping;\n const targetPixelBbox = resultAdapter.adaptElementLocateResultToPixelBbox(\n res.content,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n );\n resRect = pixelBboxToRect(\n mapSearchAreaPixelBboxToOriginalPixelBbox(targetPixelBbox, mapping),\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n elementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElement = element;\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse locate result: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n element: matchedElement,\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelRuntime: ModelRuntime;\n abortSignal?: AbortSignal;\n}): Promise<{\n searchAreaConfig?: SearchAreaConfig;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const modelRuntime = options.modelRuntime;\n const { adapter } = modelRuntime;\n assert(\n adapter.locate.kind === 'standard',\n 'section locate requires a standard locate adapter',\n );\n const screenshotBase64 = context.screenshot.base64;\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: context.shotSize.width,\n height: context.shotSize.height,\n policy: adapter.imagePreprocess,\n });\n\n const systemPrompt = systemPromptToLocateSection(\n adapter.locate.resultAdapter.promptSpec,\n );\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: preparedImage.imageBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelRuntime,\n {\n abortSignal: options.abortSignal,\n jsonParserSource: 'section-locator',\n },\n );\n } catch (callError) {\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n searchAreaConfig: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let searchAreaConfig:\n | Awaited<ReturnType<typeof buildSearchAreaConfig>>\n | undefined;\n let sectionError = result.content.error;\n const resultAdapter = adapter.locate.resultAdapter;\n if (!hasLocateResult(result.content, resultAdapter.promptSpec.resultKey)) {\n return {\n searchAreaConfig: undefined,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n }\n\n try {\n const adaptedResult =\n resultAdapter.adaptSectionLocateResultToPixelBboxGroup(result.content, {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n });\n const mergedRect = mergePixelBboxesToRect([\n adaptedResult.target,\n ...(adaptedResult.references ?? []),\n ]);\n debugSection('mergedRect %j', mergedRect);\n\n const expandedRect = expandSearchArea(mergedRect, context.shotSize);\n const originalWidth = expandedRect.width;\n const originalHeight = expandedRect.height;\n debugSection('expanded sectionRect %j', expandedRect);\n\n searchAreaConfig = await buildSearchAreaConfig({\n context,\n baseRect: mergedRect,\n });\n\n debugSection(\n 'scaled section image from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n searchAreaConfig.image.width,\n searchAreaConfig.image.height,\n searchAreaConfig.mapping.scale,\n );\n } catch (error) {\n const parseErrorMessage =\n error instanceof Error\n ? `Failed to parse section locate result: ${error.message}`\n : 'unknown error in section locate';\n sectionError = sectionError\n ? `${sectionError} (${parseErrorMessage})`\n : parseErrorMessage;\n }\n\n return {\n searchAreaConfig,\n error: sectionError,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelRuntime: ModelRuntime;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelRuntime } =\n options;\n const systemPrompt = systemPromptToExtract({\n screenshotIncluded: extractOption?.screenshotIncluded !== false,\n referenceImagesIncluded: !!multimodalPrompt?.images?.length,\n });\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'text',\n text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',\n });\n\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelRuntime);\n\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n modelRuntime: ModelRuntime,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: InspectAIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n debugInspect('AiJudgeOrderSensitive: description=%s', description);\n\n const result = await callAIWithObjectResponse<{ isOrderSensitive: boolean }>(\n msgs,\n modelRuntime,\n {\n jsonParserSource: 'generic-object',\n },\n );\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","hasLocateResult","input","resultKey","record","locateResult","Array","undefined","buildSearchAreaConfig","options","context","baseRect","scaleRatio","sectionRect","expandSearchArea","croppedResult","cropByRect","scaledResult","scaleImage","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","targetElementDescription","locateOptions","locateAdapter","genericLocate","elementDescription","modelRuntime","adapter","assert","screenshotBase64","elementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","modelImage","preparedImage","prepareModelImage","imagePayload","addOns","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","usage","JSON","resRect","matchedElement","errors","resultAdapter","mapping","targetPixelBbox","pixelBboxToRect","mapSearchAreaPixelBboxToOriginalPixelBbox","element","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","searchAreaConfig","sectionError","adaptedResult","mergedRect","mergePixelBboxesToRect","expandedRect","originalWidth","originalHeight","error","parseErrorMessage","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AAgEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,SAASE,gBAAgBC,KAAc,EAAEC,SAAiB;IACxD,IAAI,CAACD,SAAS,AAAiB,YAAjB,OAAOA,OACnB,OAAO;IAGT,MAAME,SAASF;IACf,MAAMG,eAAeD,MAAM,CAACD,UAAU;IACtC,OAAOG,MAAM,OAAO,CAACD,gBACjBA,aAAa,MAAM,GAAG,IACtBA,AAAiBE,WAAjBF;AACN;AAEO,eAAeG,sBAAsBC,OAG3C;IACC,MAAM,EAAEC,OAAO,EAAEC,QAAQ,EAAE,GAAGF;IAC9B,MAAMG,aAAa;IACnB,MAAMC,cAAcC,iBAAiBH,UAAUD,QAAQ,QAAQ;IAE/D,MAAMK,gBAAgB,MAAMC,WAC1BN,QAAQ,UAAU,CAAC,MAAM,EACzBG;IAGF,MAAMI,eAAe,MAAMC,WAAWH,cAAc,WAAW,EAAEH;IACjE,OAAO;QACL,YAAYC;QACZ,OAAO;YACL,aAAaI,aAAa,WAAW;YACrC,OAAOA,aAAa,KAAK;YACzB,QAAQA,aAAa,MAAM;QAC7B;QACA,SAAS;YACP,QAAQ;gBACN,GAAGJ,YAAY,IAAI;gBACnB,GAAGA,YAAY,GAAG;YACpB;YACA,OAAOD;QACT;IACF;AACF;AAEO,MAAMO,0BAA0B,CAACC;IACtC,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAET,OAAOA,OAAO,MAAM;AACtB;AAEO,MAAMC,qBAAqB,OAChCC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,uDAAuD,CAAC;oBAChH;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBACpBlB,OAAkE;IAElE,MAAM,EAAEmB,wBAAwB,EAAE,GAAGC,eAAe,GAAGpB;IACvD,MAAMqB,gBAAgBrB,QAAQ,YAAY,CAAC,OAAO,CAAC,MAAM;IACzD,IAAIqB,AAAuB,aAAvBA,cAAc,IAAI,EACpB,OAAOA,cAAc,QAAQ,CAACF,0BAA0BC;IAE1D,OAAOE,cAAcH,0BAA0BC;AACjD;AAEO,eAAeE,cACpBC,kBAA+B,EAC/BvB,OAAsB;IAEtB,MAAM,EAAEC,OAAO,EAAE,GAAGD;IACpB,MAAMwB,eAAexB,QAAQ,YAAY;IACzC,MAAM,EAAEyB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmB1B,QAAQ,UAAU,CAAC,MAAM;IAElDyB,OAAOH,oBAAoB;IAC3B,MAAMK,yBAAyBlB,wBAAwBa;IACvD,MAAMM,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,4BACnBP,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAGzC,MAAMQ,aAAajC,QAAQ,YAAY,EAAE,SAAS;QAChD,aAAa2B;QACb,OAAO1B,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;IACjC;IACA,MAAMiC,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaF,WAAW,WAAW;QACnC,OAAOA,WAAW,KAAK;QACvB,QAAQA,WAAW,MAAM;QACzB,QAAQR,QAAQ,eAAe;IACjC;IAEA,MAAMW,eAAeF,cAAc,WAAW;IAE9C,MAAMpB,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKK;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMP;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAON,oBAAiC;QAC1C,MAAMc,SAAS,MAAMzB,mBAAmB;YACtC,QAAQW,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAT,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAIC;IAGJ,IAAI;QACFA,MAAM,MAAMC,yBACVzB,MACAU,cACA;YACE,aAAaxB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOwC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAG1C;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,SAASA;gBACT,QAAQ;oBAAC,CAAC,eAAe,EAAE2C,cAAc;iBAAC;YAC5C;YACAG;YACAE;YACA,mBAAmBhD;QACrB;IACF;IAEA,MAAM8C,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIU;IACJ,IAAIC;IACJ,IAAIC,SACF,YAAYZ,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,MAAMa,gBAAgB1B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACjC,gBAAgB8C,IAAI,OAAO,EAAEa,cAAc,UAAU,CAAC,SAAS,GAClE,OAAO;QACL,MAAMrD;QACN,aAAa;YACX,SAASA;YACT,QAAQoD;QACV;QACAN;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;IAGF,IAAI;QACF,MAAMc,UAAUpD,QAAQ,YAAY,EAAE;QACtC,MAAMqD,kBAAkBF,cAAc,mCAAmC,CACvEb,IAAI,OAAO,EACX;YACE,cAAcJ,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QAEFc,UAAUM,gBACRC,0CAA0CF,iBAAiBD;QAG7D/D,aAAa,WAAW2D;QAExB,MAAMQ,UAA+BC,sBACnCT,SACApB;QAEFsB,SAAS,EAAE;QAEX,IAAIM,SACFP,iBAAiBO;IAErB,EAAE,OAAOE,GAAG;QACV,MAAMC,MACJD,aAAahB,QACT,CAAC,+BAA+B,EAAEgB,EAAE,OAAO,EAAE,GAC7C;QACN,IAAI,AAACR,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAES,IAAI,CAAC,CAAC;aAFtBT,SAAS;YAACS;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMX;QACN,aAAa;YACX,SAASC;YACT,QAAQC;QACV;QACAN;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAesB,gBAAgB5D,OAKrC;IAMC,MAAM,EAAEC,OAAO,EAAE4D,kBAAkB,EAAE,GAAG7D;IACxC,MAAMwB,eAAexB,QAAQ,YAAY;IACzC,MAAM,EAAEyB,OAAO,EAAE,GAAGD;IACpBE,OACED,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,EACnB;IAEF,MAAME,mBAAmB1B,QAAQ,UAAU,CAAC,MAAM;IAClD,MAAMiC,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaR;QACb,OAAO1B,QAAQ,QAAQ,CAAC,KAAK;QAC7B,QAAQA,QAAQ,QAAQ,CAAC,MAAM;QAC/B,QAAQwB,QAAQ,eAAe;IACjC;IAEA,MAAMM,eAAe+B,4BACnBrC,QAAQ,MAAM,CAAC,aAAa,CAAC,UAAU;IAEzC,MAAMsC,gCAAgCC,0BACpCtD,wBAAwBmD;IAE1B,MAAM/C,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG,cAAc,WAAW;wBAC9B,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM6B;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMxB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQiD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA/C,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAI4B;IAGJ,IAAI;QACFA,SAAS,MAAM1B,yBACbzB,MACAU,cACA;YACE,aAAaxB,QAAQ,WAAW;YAChC,kBAAkB;QACpB;IAEJ,EAAE,OAAOwC,WAAW;QAClB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMK,QACJN,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAG1C;QAChE,OAAO;YACL,kBAAkBA;YAClB,OAAO,CAAC,eAAe,EAAE2C,cAAc;YACvCG;YACAE;QACF;IACF;IAEA,IAAIoB;IAGJ,IAAIC,eAAeF,OAAO,OAAO,CAAC,KAAK;IACvC,MAAMd,gBAAgB1B,QAAQ,MAAM,CAAC,aAAa;IAClD,IAAI,CAACjC,gBAAgByE,OAAO,OAAO,EAAEd,cAAc,UAAU,CAAC,SAAS,GACrE,OAAO;QACL,kBAAkBrD;QAClB,OAAOqE;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;IAGF,IAAI;QACF,MAAMG,gBACJjB,cAAc,wCAAwC,CAACc,OAAO,OAAO,EAAE;YACrE,cAAc/B,cAAc,YAAY;YACxC,aAAaA,cAAc,WAAW;QACxC;QACF,MAAMmC,aAAaC,uBAAuB;YACxCF,cAAc,MAAM;eAChBA,cAAc,UAAU,IAAI,EAAE;SACnC;QACD7E,aAAa,iBAAiB8E;QAE9B,MAAME,eAAelE,iBAAiBgE,YAAYpE,QAAQ,QAAQ;QAClE,MAAMuE,gBAAgBD,aAAa,KAAK;QACxC,MAAME,iBAAiBF,aAAa,MAAM;QAC1ChF,aAAa,2BAA2BgF;QAExCL,mBAAmB,MAAMnE,sBAAsB;YAC7CE;YACA,UAAUoE;QACZ;QAEA9E,aACE,uDACAiF,eACAC,gBACAP,iBAAiB,KAAK,CAAC,KAAK,EAC5BA,iBAAiB,KAAK,CAAC,MAAM,EAC7BA,iBAAiB,OAAO,CAAC,KAAK;IAElC,EAAE,OAAOQ,OAAO;QACd,MAAMC,oBACJD,iBAAiBhC,QACb,CAAC,uCAAuC,EAAEgC,MAAM,OAAO,EAAE,GACzD;QACNP,eAAeA,eACX,GAAGA,aAAa,EAAE,EAAEQ,kBAAkB,CAAC,CAAC,GACxCA;IACN;IAEA,OAAO;QACLT;QACA,OAAOC;QACP,aAAapB,KAAK,SAAS,CAACkB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeW,qBAAwB5E,OAO7C;IACC,MAAM,EAAE6E,SAAS,EAAE5E,OAAO,EAAE6E,aAAa,EAAEjE,gBAAgB,EAAEW,YAAY,EAAE,GACzExB;IACF,MAAM+B,eAAegD,sBAAsB;QACzC,oBAAoBD,eAAe,uBAAuB;QAC1D,yBAAyB,CAAC,CAACjE,kBAAkB,QAAQ;IACvD;IACA,MAAMc,mBAAmB1B,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM+E,wBAAwBC,uBAC5BjF,QAAQ,eAAe,IAAI,IAC3B6E;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OAAO;QAC/CI,YAAY,IAAI,CAAC;YACf,MAAM;YACN,MAAM;QACR;QAEAA,YAAY,IAAI,CAAC;YACf,MAAM;YACN,WAAW;gBACT,KAAKvD;gBACL,QAAQ;YACV;QACF;IACF;IAEAuD,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMlE,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAASmD;QACX;KACD;IAED,IAAIrE,kBAAkB;QACpB,MAAMwB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM,EACJ,SAASO,WAAW,EACpBE,KAAK,EACLqC,iBAAiB,EAClB,GAAG,MAAMC,OAAOtE,MAAMU;IAEvB,IAAI6D;IACJ,IAAI;QACFA,cAAcC,2BAA8B1C;IAC9C,EAAE,OAAO2C,YAAY;QACnB,MAAM9C,eACJ8C,sBAAsB7C,QAAQ6C,WAAW,OAAO,GAAG5C,OAAO4C;QAC5D,MAAM,IAAI1C,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAE;IAEJ;IAEA,OAAO;QACLuC;QACAzC;QACAE;QACAqC;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBjE,YAA0B;IAK1B,MAAMO,eAAe2D;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAM3E,OAAsB;QAC1B;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS4D;QACX;KACD;IAEDtG,aAAa,yCAAyCoG;IAEtD,MAAMxB,SAAS,MAAM1B,yBACnBzB,MACAU,cACA;QACE,kBAAkB;IACpB;IAGF,OAAO;QACL,kBAAkByC,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
|
|
@@ -1,16 +1,17 @@
|
|
|
1
|
-
import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
|
|
2
1
|
import { getDebug } from "@midscene/shared/logger";
|
|
3
2
|
import { assert } from "@midscene/shared/utils";
|
|
4
|
-
import { buildYamlFlowFromPlans,
|
|
3
|
+
import { buildYamlFlowFromPlans, findAllMidsceneLocatorField } from "../common.mjs";
|
|
4
|
+
import { planningModelFamilyRequiredForLocateMessage } from "./errors.mjs";
|
|
5
5
|
import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
|
|
6
6
|
import { extractXMLTag, parseMarkFinishedIndexes, parseSubGoalsFromXML } from "./prompt/util.mjs";
|
|
7
|
-
import { AIResponseParseError, callAI
|
|
7
|
+
import { AIResponseParseError, callAI } from "./service-caller/index.mjs";
|
|
8
|
+
import { prepareModelImage } from "./workflows/image-preprocess.mjs";
|
|
8
9
|
const debug = getDebug('planning');
|
|
9
10
|
const warnLog = getDebug('planning', {
|
|
10
11
|
console: true
|
|
11
12
|
});
|
|
12
13
|
const noPreviousActionsText = 'No previous actions have been executed in this aiAct execution yet. If the instruction asks for actions, choose the first action to execute.';
|
|
13
|
-
function parseXMLPlanningResponse(xmlString,
|
|
14
|
+
function parseXMLPlanningResponse(xmlString, jsonParser) {
|
|
14
15
|
const thought = extractXMLTag(xmlString, 'thought');
|
|
15
16
|
const memory = extractXMLTag(xmlString, 'memory');
|
|
16
17
|
const log = extractXMLTag(xmlString, 'log') || '';
|
|
@@ -34,11 +35,12 @@ function parseXMLPlanningResponse(xmlString, modelFamily) {
|
|
|
34
35
|
const type = actionType.split('<')[0].trim();
|
|
35
36
|
let param;
|
|
36
37
|
if (actionParamStr) try {
|
|
37
|
-
param =
|
|
38
|
-
|
|
38
|
+
param = jsonParser(actionParamStr, {
|
|
39
|
+
source: 'planning-action-param',
|
|
40
|
+
preserveStringValueKeys: 'input' === type.toLowerCase() ? [
|
|
39
41
|
'value'
|
|
40
|
-
]
|
|
41
|
-
}
|
|
42
|
+
] : void 0
|
|
43
|
+
});
|
|
42
44
|
} catch (e) {
|
|
43
45
|
throw new Error(`Failed to parse action-param-json: ${e}`);
|
|
44
46
|
}
|
|
@@ -76,27 +78,28 @@ function parseXMLPlanningResponse(xmlString, modelFamily) {
|
|
|
76
78
|
};
|
|
77
79
|
}
|
|
78
80
|
async function plan(userInstruction, opts) {
|
|
79
|
-
const { context,
|
|
81
|
+
const { context, conversationHistory } = opts;
|
|
82
|
+
const modelRuntime = opts.modelRuntime;
|
|
83
|
+
const { adapter } = modelRuntime;
|
|
80
84
|
const { shotSize } = context;
|
|
81
85
|
const screenshotBase64 = context.screenshot.base64;
|
|
82
|
-
|
|
86
|
+
if (opts.includeLocateInPlanning && !modelRuntime.config.modelFamily) throw new Error(planningModelFamilyRequiredForLocateMessage(modelRuntime.config.slot));
|
|
87
|
+
const locateResultAdapter = modelRuntime.config.modelFamily && 'standard' === adapter.locate.kind ? adapter.locate.resultAdapter : void 0;
|
|
83
88
|
const includeSubGoals = true === opts.deepThink;
|
|
84
89
|
const systemPrompt = await systemPromptToTaskPlanning({
|
|
85
90
|
actionSpace: opts.actionSpace,
|
|
86
|
-
|
|
87
|
-
|
|
91
|
+
locatePromptSpec: locateResultAdapter?.promptSpec,
|
|
92
|
+
includeLocateInPlanning: opts.includeLocateInPlanning,
|
|
88
93
|
includeThought: true,
|
|
89
94
|
includeSubGoals
|
|
90
95
|
});
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
imagePayload = paddedResult.imageBase64;
|
|
99
|
-
}
|
|
96
|
+
const preparedImage = await prepareModelImage({
|
|
97
|
+
imageBase64: screenshotBase64,
|
|
98
|
+
width: shotSize.width,
|
|
99
|
+
height: shotSize.height,
|
|
100
|
+
policy: adapter.imagePreprocess
|
|
101
|
+
});
|
|
102
|
+
const imagePayload = preparedImage.imageBase64;
|
|
100
103
|
const actionContext = opts.actionContext ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\n` : '';
|
|
101
104
|
const instruction = [
|
|
102
105
|
{
|
|
@@ -159,23 +162,23 @@ async function plan(userInstruction, opts) {
|
|
|
159
162
|
...instruction,
|
|
160
163
|
...historyLog
|
|
161
164
|
];
|
|
162
|
-
let { content: rawResponse, usage, reasoning_content } = await callAI(msgs,
|
|
165
|
+
let { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelRuntime, {
|
|
163
166
|
abortSignal: opts.abortSignal,
|
|
164
|
-
|
|
167
|
+
requiresOriginalImageDetail: opts.includeLocateInPlanning
|
|
165
168
|
});
|
|
166
169
|
let planFromAI;
|
|
167
170
|
try {
|
|
168
171
|
try {
|
|
169
|
-
planFromAI = parseXMLPlanningResponse(rawResponse,
|
|
172
|
+
planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);
|
|
170
173
|
} catch {
|
|
171
|
-
const retry = await callAI(msgs,
|
|
174
|
+
const retry = await callAI(msgs, modelRuntime, {
|
|
172
175
|
abortSignal: opts.abortSignal,
|
|
173
|
-
|
|
176
|
+
requiresOriginalImageDetail: opts.includeLocateInPlanning
|
|
174
177
|
});
|
|
175
178
|
rawResponse = retry.content;
|
|
176
179
|
usage = retry.usage;
|
|
177
180
|
reasoning_content = retry.reasoning_content;
|
|
178
|
-
planFromAI = parseXMLPlanningResponse(rawResponse,
|
|
181
|
+
planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);
|
|
179
182
|
}
|
|
180
183
|
if (planFromAI.action && void 0 !== planFromAI.finalizeSuccess) {
|
|
181
184
|
warnLog('Planning response included both an action and <complete>; ignoring <complete> output.');
|
|
@@ -209,7 +212,22 @@ async function plan(userInstruction, opts) {
|
|
|
209
212
|
debug('locateFields', locateFields);
|
|
210
213
|
locateFields.forEach((field)=>{
|
|
211
214
|
const locateResult = action.param[field];
|
|
212
|
-
if (locateResult
|
|
215
|
+
if (locateResult) {
|
|
216
|
+
if (!opts.includeLocateInPlanning) {
|
|
217
|
+
if ('object' == typeof locateResult) action.param[field] = {
|
|
218
|
+
prompt: locateResult.prompt
|
|
219
|
+
};
|
|
220
|
+
return;
|
|
221
|
+
}
|
|
222
|
+
assert(locateResultAdapter, 'generic planning locate normalization requires a standard locate adapter');
|
|
223
|
+
action.param[field] = {
|
|
224
|
+
...locateResult,
|
|
225
|
+
locatedPixelBbox: locateResultAdapter.adaptPlanningParamToPixelBbox(locateResult, {
|
|
226
|
+
preparedSize: preparedImage.preparedSize,
|
|
227
|
+
contentSize: preparedImage.contentSize
|
|
228
|
+
})
|
|
229
|
+
};
|
|
230
|
+
}
|
|
213
231
|
});
|
|
214
232
|
});
|
|
215
233
|
if (includeSubGoals) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n parseModelResponseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\nconst noPreviousActionsText =\n 'No previous actions have been executed in this aiAct execution yet. If the instruction asks for actions, choose the first action to execute.';\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n // Strip any trailing XML tags that LLM might have leaked into the action type\n // e.g. \"KeyboardPress</action-type>\\n<action-param-json>\" -> \"KeyboardPress\"\n const type = actionType.split('<')[0].trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = parseModelResponseJson(\n actionParamStr,\n modelFamily,\n type.toLowerCase() === 'input'\n ? { preserveStringValueKeys: ['value'] }\n : undefined,\n );\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n // Controls aiAct planning prompt shape and state updates, such as sub-goals.\n deepThink?: boolean;\n abortSignal?: AbortSignal;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when aiAct is in deep-thinking planning mode.\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = shotSize.width;\n let imageHeight = shotSize.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In planning deep-think mode: show full sub-goals with logs\n // Otherwise: show historical execution logs\n const executionProgressText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const executionProgressSection = executionProgressText\n ? `\\n\\n${executionProgressText}`\n : conversationHistory.pendingFeedbackMessage\n ? ''\n : `\\n\\n${noPreviousActionsText}`;\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `This is the current screenshot.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n abortSignal: opts.abortSignal,\n // When GPT-5 planning includes bbox, the planning call also performs\n // localization, so the screenshot should be sent with original detail.\n forceOriginalImageDetail: modelFamily === 'gpt-5' && opts.includeBbox,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n } catch {\n const retry = await callAI(msgs, modelConfig, {\n abortSignal: opts.abortSignal,\n // Keep retry requests consistent with the initial planning call.\n forceOriginalImageDetail: modelFamily === 'gpt-5' && opts.includeBbox,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed in planning deep-think mode.\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history only in planning deep-think mode.\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // Without planning deep-think mode, accumulate logs as historical execution steps.\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","noPreviousActionsText","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","parseModelResponseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","shotSize","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","executionProgressText","executionProgressSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;AA8BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAErD,MAAME,wBACJ;AAKK,SAASC,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QAGrD,MAAMc,OAAOd,WAAW,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI;QAC1C,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,uBACNf,gBACAP,aACAoB,AAAuB,YAAvBA,KAAK,WAAW,KACZ;gBAAE,yBAAyB;oBAAC;iBAAQ;YAAC,IACrCR;QAER,EAAE,OAAOW,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAYC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,QAAQ,EAAE,GAAGH;IACrB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,SAAS,KAAK;IAC/B,IAAIO,cAAcP,SAAS,MAAM;IAKjC,IAAI/B,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMuC,eAAe,MAAMC,4BAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBd,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMe,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEf,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIiB;IAKJ,MAAMC,wBAAwBX,kBAC1BH,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMe,2BAA2BD,wBAC7B,CAAC,IAAI,EAAEA,uBAAuB,GAC9Bd,oBAAoB,sBAAsB,GACxC,KACA,CAAC,IAAI,EAAEjC,uBAAuB;IAGpC,MAAMiD,eAAehB,oBAAoB,cAAc;IACvD,MAAMiB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIhB,oBAAoB,sBAAsB,EAAE;QAC9Ca,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGb,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEiB,kBAAkBF,0BAA0B;gBACzN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEa,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,+BAA+B,EAAEI,kBAAkBF,0BAA0B;YACtF;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACa;IAG3Bb,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMkB,aAAalB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMsB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMpB,aAAa;QAClC,aAAaF,KAAK,WAAW;QAG7B,0BAA0B3B,AAAgB,YAAhBA,eAA2B2B,KAAK,WAAW;IACvE;IAGA,IAAI2B;IACJ,IAAI;QACF,IAAI;YACFA,aAAaxD,yBAAyBoD,aAAalD;QACrD,EAAE,OAAM;YACN,MAAMuD,QAAQ,MAAMF,OAAOJ,MAAMpB,aAAa;gBAC5C,aAAaF,KAAK,WAAW;gBAE7B,0BAA0B3B,AAAgB,YAAhBA,eAA2B2B,KAAK,WAAW;YACvE;YACAuB,cAAcK,MAAM,OAAO;YAC3BJ,QAAQI,MAAM,KAAK;YACnBH,oBAAoBG,MAAM,iBAAiB;YAC3CD,aAAaxD,yBAAyBoD,aAAalD;QACrD;QAEA,IAAIsD,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YACjE1D,QACE;YAEF0D,WAAW,eAAe,GAAG1C;YAC7B0C,WAAW,eAAe,GAAG1C;QAC/B;QAEA,MAAM4C,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YAC5C5D,MAAM;YACN+D,yBAAyB;YAEzB,IAAIxB,iBACFH,oBAAoB,uBAAuB;QAE/C;QAEA,MAAM4B,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAN;YACAC;YACAC;YACA,UAAUO,uBAAuBH,SAAS7B,KAAK,WAAW;YAC1D8B;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACrC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM0C,sBAAsBlC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;YAG9B1B,MAAM,+BAA+BmE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENnE,MAAM,gBAAgBoE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAe9C,OAAO,KAAK,CAAC6C,MAAM;gBACxC,IAAIC,gBAAgBjE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAAC6C,MAAM,GAAGE,cACpBD,cACA5B,YACAC,aACAtC;YAGN;QACF;QAGA,IAAIiC,iBAAiB;YACnB,IAAIqB,WAAW,cAAc,EAAE,QAC7BxB,oBAAoB,aAAa,CAACwB,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMa,SAASb,WAAW,mBAAmB,CAChDxB,oBAAoB,mBAAmB,CAACqC;YAI5C,IAAIb,WAAW,GAAG,EAChBxB,oBAAoB,gBAAgB,CAACwB,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChBxB,oBAAoB,mBAAmB,CAACwB,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnBxB,oBAAoB,YAAY,CAACwB,WAAW,MAAM;QAGpDxB,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMoB;gBACR;aACD;QACH;QAEA,OAAOQ;IACT,EAAE,OAAOU,YAAY;QAEnB,MAAMC,eACJD,sBAAsB5C,QAAQ4C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCnB,aACAC;IAEJ;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n} from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { buildYamlFlowFromPlans, findAllMidsceneLocatorField } from '../common';\nimport { planningModelFamilyRequiredForLocateMessage } from './errors';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport { AIResponseParseError, callAI } from './service-caller/index';\nimport type { JsonParser, JsonParserSource } from './service-caller/json';\nimport { prepareModelImage } from './workflows/image-preprocess';\nimport type { PlanOptions } from './workflows/planning/types';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\nconst noPreviousActionsText =\n 'No previous actions have been executed in this aiAct execution yet. If the instruction asks for actions, choose the first action to execute.';\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse.\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n jsonParser: JsonParser,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n // Strip any trailing XML tags that LLM might have leaked into the action type\n // e.g. \"KeyboardPress</action-type>\\n<action-param-json>\" -> \"KeyboardPress\"\n const type = actionType.split('<')[0].trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = jsonParser(actionParamStr, {\n source: 'planning-action-param',\n preserveStringValueKeys:\n type.toLowerCase() === 'input' ? ['value'] : undefined,\n });\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: PlanOptions,\n): Promise<PlanningAIResponse> {\n const { context, conversationHistory } = opts;\n const modelRuntime = opts.modelRuntime;\n const { adapter } = modelRuntime;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n if (opts.includeLocateInPlanning && !modelRuntime.config.modelFamily) {\n throw new Error(\n planningModelFamilyRequiredForLocateMessage(modelRuntime.config.slot),\n );\n }\n\n const locateResultAdapter =\n modelRuntime.config.modelFamily && adapter.locate.kind === 'standard'\n ? adapter.locate.resultAdapter\n : undefined;\n\n // Only enable sub-goals when aiAct is in deep-thinking planning mode.\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n locatePromptSpec: locateResultAdapter?.promptSpec,\n includeLocateInPlanning: opts.includeLocateInPlanning,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n const preparedImage = await prepareModelImage({\n imageBase64: screenshotBase64,\n width: shotSize.width,\n height: shotSize.height,\n policy: adapter.imagePreprocess,\n });\n const imagePayload = preparedImage.imageBase64;\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In planning deep-think mode: show full sub-goals with logs\n // Otherwise: show historical execution logs\n const executionProgressText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const executionProgressSection = executionProgressText\n ? `\\n\\n${executionProgressText}`\n : conversationHistory.pendingFeedbackMessage\n ? ''\n : `\\n\\n${noPreviousActionsText}`;\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `This is the current screenshot.${memoriesSection}${executionProgressSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Planning with locate results is localization-sensitive. Adapters decide\n // whether this should request original image detail.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n } catch {\n const retry = await callAI(msgs, modelRuntime, {\n abortSignal: opts.abortSignal,\n // Keep retry requests consistent with the initial planning call.\n requiresOriginalImageDetail: opts.includeLocateInPlanning,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n planFromAI = parseXMLPlanningResponse(rawResponse, adapter.jsonParser);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed in planning deep-think mode.\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (!opts.includeLocateInPlanning) {\n if (typeof locateResult === 'object') {\n // In prompt-only planning mode, ignore any accidental coordinates from the model.\n action.param[field] = { prompt: locateResult.prompt };\n }\n return;\n }\n\n assert(\n locateResultAdapter,\n 'generic planning locate normalization requires a standard locate adapter',\n );\n action.param[field] = {\n ...locateResult,\n locatedPixelBbox: locateResultAdapter.adaptPlanningParamToPixelBbox(\n locateResult,\n {\n preparedSize: preparedImage.preparedSize,\n contentSize: preparedImage.contentSize,\n },\n ),\n };\n }\n });\n });\n\n // Update sub-goals in conversation history only in planning deep-think mode.\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // Without planning deep-think mode, accumulate logs as historical execution steps.\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","noPreviousActionsText","parseXMLPlanningResponse","xmlString","jsonParser","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","e","Error","plan","userInstruction","opts","context","conversationHistory","modelRuntime","adapter","shotSize","screenshotBase64","planningModelFamilyRequiredForLocateMessage","locateResultAdapter","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","preparedImage","prepareModelImage","imagePayload","actionContext","instruction","latestFeedbackMessage","executionProgressText","executionProgressSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;;AAoBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAErD,MAAME,wBACJ;AAKK,SAASC,yBACdC,SAAiB,EACjBC,UAAsB;IAEtB,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QAGrD,MAAMc,OAAOd,WAAW,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI;QAC1C,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQrB,WAAWO,gBAAgB;gBACjC,QAAQ;gBACR,yBACEa,AAAuB,YAAvBA,KAAK,WAAW,KAAiB;oBAAC;iBAAQ,GAAGR;YACjD;QACF,EAAE,OAAOU,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFH,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeO,KACpBC,eAAuB,EACvBC,IAAiB;IAEjB,MAAM,EAAEC,OAAO,EAAEC,mBAAmB,EAAE,GAAGF;IACzC,MAAMG,eAAeH,KAAK,YAAY;IACtC,MAAM,EAAEI,OAAO,EAAE,GAAGD;IACpB,MAAM,EAAEE,QAAQ,EAAE,GAAGJ;IACrB,MAAMK,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,IAAID,KAAK,uBAAuB,IAAI,CAACG,aAAa,MAAM,CAAC,WAAW,EAClE,MAAM,IAAIN,MACRU,4CAA4CJ,aAAa,MAAM,CAAC,IAAI;IAIxE,MAAMK,sBACJL,aAAa,MAAM,CAAC,WAAW,IAAIC,AAAwB,eAAxBA,QAAQ,MAAM,CAAC,IAAI,GAClDA,QAAQ,MAAM,CAAC,aAAa,GAC5BlB;IAGN,MAAMuB,kBAAkBT,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMU,eAAe,MAAMC,2BAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,kBAAkBQ,qBAAqB;QACvC,yBAAyBR,KAAK,uBAAuB;QACrD,gBAAgB;QAChBS;IACF;IAEA,MAAMG,gBAAgB,MAAMC,kBAAkB;QAC5C,aAAaP;QACb,OAAOD,SAAS,KAAK;QACrB,QAAQA,SAAS,MAAM;QACvB,QAAQD,QAAQ,eAAe;IACjC;IACA,MAAMU,eAAeF,cAAc,WAAW;IAE9C,MAAMG,gBAAgBf,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMgB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEhB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIkB;IAKJ,MAAMC,wBAAwBT,kBAC1BP,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMiB,2BAA2BD,wBAC7B,CAAC,IAAI,EAAEA,uBAAuB,GAC9BhB,oBAAoB,sBAAsB,GACxC,KACA,CAAC,IAAI,EAAE/B,uBAAuB;IAGpC,MAAMiD,eAAelB,oBAAoB,cAAc;IACvD,MAAMmB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIlB,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEmB,kBAAkBF,0BAA0B;gBACzN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAZ,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,+BAA+B,EAAEI,kBAAkBF,0BAA0B;YACtF;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKL;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFZ,oBAAoB,MAAM,CAACe;IAG3Bf,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMoB,aAAapB,oBAAoB,QAAQ,CAACF,KAAK,kBAAkB;IAEvE,MAAMuB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCM;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMpB,cAAc;QACnC,aAAaH,KAAK,WAAW;QAG7B,6BAA6BA,KAAK,uBAAuB;IAC3D;IAGA,IAAI4B;IACJ,IAAI;QACF,IAAI;YACFA,aAAaxD,yBAAyBoD,aAAapB,QAAQ,UAAU;QACvE,EAAE,OAAM;YACN,MAAMyB,QAAQ,MAAMF,OAAOJ,MAAMpB,cAAc;gBAC7C,aAAaH,KAAK,WAAW;gBAE7B,6BAA6BA,KAAK,uBAAuB;YAC3D;YACAwB,cAAcK,MAAM,OAAO;YAC3BJ,QAAQI,MAAM,KAAK;YACnBH,oBAAoBG,MAAM,iBAAiB;YAC3CD,aAAaxD,yBAAyBoD,aAAapB,QAAQ,UAAU;QACvE;QAEA,IAAIwB,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YACjE1D,QACE;YAEF0D,WAAW,eAAe,GAAG1C;YAC7B0C,WAAW,eAAe,GAAG1C;QAC/B;QAEA,MAAM4C,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YAC5C5D,MAAM;YACN+D,yBAAyB;YAEzB,IAAItB,iBACFP,oBAAoB,uBAAuB;QAE/C;QAEA,MAAM8B,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAN;YACAC;YACAC;YACA,UAAUO,uBAAuBH,SAAS9B,KAAK,WAAW;YAC1D+B;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACrC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM0C,sBAAsBnC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACP,SAAWA,OAAO,IAAI,KAAKC;YAG9B1B,MAAM,+BAA+BmE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENnE,MAAM,gBAAgBoE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAe9C,OAAO,KAAK,CAAC6C,MAAM;gBACxC,IAAIC,cAAc;oBAChB,IAAI,CAACvC,KAAK,uBAAuB,EAAE;wBACjC,IAAI,AAAwB,YAAxB,OAAOuC,cAET9C,OAAO,KAAK,CAAC6C,MAAM,GAAG;4BAAE,QAAQC,aAAa,MAAM;wBAAC;wBAEtD;oBACF;oBAEAL,OACE1B,qBACA;oBAEFf,OAAO,KAAK,CAAC6C,MAAM,GAAG;wBACpB,GAAGC,YAAY;wBACf,kBAAkB/B,oBAAoB,6BAA6B,CACjE+B,cACA;4BACE,cAAc3B,cAAc,YAAY;4BACxC,aAAaA,cAAc,WAAW;wBACxC;oBAEJ;gBACF;YACF;QACF;QAGA,IAAIH,iBAAiB;YACnB,IAAImB,WAAW,cAAc,EAAE,QAC7B1B,oBAAoB,aAAa,CAAC0B,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMY,SAASZ,WAAW,mBAAmB,CAChD1B,oBAAoB,mBAAmB,CAACsC;YAI5C,IAAIZ,WAAW,GAAG,EAChB1B,oBAAoB,gBAAgB,CAAC0B,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChB1B,oBAAoB,mBAAmB,CAAC0B,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnB1B,oBAAoB,YAAY,CAAC0B,WAAW,MAAM;QAGpD1B,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMsB;gBACR;aACD;QACH;QAEA,OAAOQ;IACT,EAAE,OAAOS,YAAY;QAEnB,MAAMC,eACJD,sBAAsB5C,QAAQ4C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClClB,aACAC;IAEJ;AACF"}
|
|
@@ -1,10 +1,24 @@
|
|
|
1
|
-
import { adaptBbox, pointToBbox } from "../../common.mjs";
|
|
2
1
|
import { getDebug } from "@midscene/shared/logger";
|
|
2
|
+
import { finalizePixelBbox } from "../../shared/model-locate-result/bbox.mjs";
|
|
3
|
+
import { mapLocateResultToPixelBboxByCoordinates } from "../../shared/model-locate-result/pixel-bbox-mapper.mjs";
|
|
3
4
|
const debug = getDebug('auto-glm-actions');
|
|
4
5
|
const AUTO_GLM_COORDINATE_MAX = 1000;
|
|
5
|
-
function
|
|
6
|
-
const
|
|
7
|
-
|
|
6
|
+
function autoGLMCoordinateToLocateParam(coordinate, size) {
|
|
7
|
+
const ctx = {
|
|
8
|
+
preparedSize: size
|
|
9
|
+
};
|
|
10
|
+
const pixelBbox = mapLocateResultToPixelBboxByCoordinates({
|
|
11
|
+
type: 'point',
|
|
12
|
+
coordinates: coordinate
|
|
13
|
+
}, ctx, {
|
|
14
|
+
shape: 'point',
|
|
15
|
+
order: 'xy',
|
|
16
|
+
normalizedBy: AUTO_GLM_COORDINATE_MAX
|
|
17
|
+
});
|
|
18
|
+
return {
|
|
19
|
+
prompt: '',
|
|
20
|
+
locatedPixelBbox: finalizePixelBbox(pixelBbox, coordinate, ctx)
|
|
21
|
+
};
|
|
8
22
|
}
|
|
9
23
|
const BACK_BUTTON_NAMES = [
|
|
10
24
|
'AndroidBackButton',
|
|
@@ -42,16 +56,7 @@ function transformAutoGLMAction(action, size, actionSpace) {
|
|
|
42
56
|
{
|
|
43
57
|
const tapAction = doAction;
|
|
44
58
|
debug('Transform Tap action:', tapAction);
|
|
45
|
-
const
|
|
46
|
-
const locate = {
|
|
47
|
-
prompt: '',
|
|
48
|
-
bbox: [
|
|
49
|
-
x1,
|
|
50
|
-
y1,
|
|
51
|
-
x2,
|
|
52
|
-
y2
|
|
53
|
-
]
|
|
54
|
-
};
|
|
59
|
+
const locate = autoGLMCoordinateToLocateParam(tapAction.element, size);
|
|
55
60
|
return [
|
|
56
61
|
{
|
|
57
62
|
type: 'Tap',
|
|
@@ -65,16 +70,7 @@ function transformAutoGLMAction(action, size, actionSpace) {
|
|
|
65
70
|
{
|
|
66
71
|
const doubleTapAction = doAction;
|
|
67
72
|
debug('Transform Double Tap action:', doubleTapAction);
|
|
68
|
-
const
|
|
69
|
-
const locate = {
|
|
70
|
-
prompt: '',
|
|
71
|
-
bbox: [
|
|
72
|
-
x1,
|
|
73
|
-
y1,
|
|
74
|
-
x2,
|
|
75
|
-
y2
|
|
76
|
-
]
|
|
77
|
-
};
|
|
73
|
+
const locate = autoGLMCoordinateToLocateParam(doubleTapAction.element, size);
|
|
78
74
|
return [
|
|
79
75
|
{
|
|
80
76
|
type: 'DoubleClick',
|
|
@@ -101,16 +97,7 @@ function transformAutoGLMAction(action, size, actionSpace) {
|
|
|
101
97
|
{
|
|
102
98
|
const swipeAction = doAction;
|
|
103
99
|
debug('Transform Swipe action:', swipeAction);
|
|
104
|
-
const
|
|
105
|
-
const locate = {
|
|
106
|
-
prompt: '',
|
|
107
|
-
bbox: [
|
|
108
|
-
x1,
|
|
109
|
-
y1,
|
|
110
|
-
x2,
|
|
111
|
-
y2
|
|
112
|
-
]
|
|
113
|
-
};
|
|
100
|
+
const locate = autoGLMCoordinateToLocateParam(swipeAction.start, size);
|
|
114
101
|
const deltaX = swipeAction.end[0] - swipeAction.start[0];
|
|
115
102
|
const deltaY = swipeAction.end[1] - swipeAction.start[1];
|
|
116
103
|
let direction;
|
|
@@ -141,16 +128,7 @@ function transformAutoGLMAction(action, size, actionSpace) {
|
|
|
141
128
|
{
|
|
142
129
|
const longPressAction = doAction;
|
|
143
130
|
debug('Transform Long Press action:', longPressAction);
|
|
144
|
-
const
|
|
145
|
-
const locate = {
|
|
146
|
-
prompt: '',
|
|
147
|
-
bbox: [
|
|
148
|
-
x1,
|
|
149
|
-
y1,
|
|
150
|
-
x2,
|
|
151
|
-
y2
|
|
152
|
-
]
|
|
153
|
-
};
|
|
131
|
+
const locate = autoGLMCoordinateToLocateParam(longPressAction.element, size);
|
|
154
132
|
return [
|
|
155
133
|
{
|
|
156
134
|
type: 'LongPress',
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/auto-glm/actions.mjs","sources":["../../../../../src/ai-model/models/auto-glm/actions.ts"],"sourcesContent":["import type { DeviceAction } from '@/device';\nimport type {\n PlanningAction,\n PlanningLocateParamWithLocatedPixelBbox,\n} from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\nimport { finalizePixelBbox } from '../../shared/model-locate-result/bbox';\nimport { mapLocateResultToPixelBboxByCoordinates } from '../../shared/model-locate-result/pixel-bbox-mapper';\n\nconst debug = getDebug('auto-glm-actions');\n\n/**\n * Auto-GLM coordinate system range: [0, AUTO_GLM_COORDINATE_MAX]\n */\nconst AUTO_GLM_COORDINATE_MAX = 1000;\n\nfunction autoGLMCoordinateToLocateParam(\n coordinate: [number, number],\n size: { width: number; height: number },\n): PlanningLocateParamWithLocatedPixelBbox {\n const ctx = { preparedSize: size };\n const pixelBbox = mapLocateResultToPixelBboxByCoordinates(\n { type: 'point', coordinates: coordinate },\n ctx,\n { shape: 'point', order: 'xy', normalizedBy: AUTO_GLM_COORDINATE_MAX },\n );\n\n return {\n prompt: '',\n locatedPixelBbox: finalizePixelBbox(pixelBbox, coordinate, ctx),\n };\n}\n\nexport interface BaseAction {\n _metadata: string;\n think?: string;\n}\n\nexport interface TapAction extends BaseAction {\n _metadata: 'do';\n action: 'Tap';\n element: [number, number];\n}\n\nexport interface DoubleTapAction extends BaseAction {\n _metadata: 'do';\n action: 'Double Tap';\n element: [number, number];\n}\n\nexport interface TypeAction extends BaseAction {\n _metadata: 'do';\n action: 'Type';\n text: string;\n}\n\nexport interface SwipeAction extends BaseAction {\n _metadata: 'do';\n action: 'Swipe';\n start: [number, number];\n end: [number, number];\n}\n\nexport interface LongPressAction extends BaseAction {\n _metadata: 'do';\n action: 'Long Press';\n element: [number, number];\n}\n\nexport interface LaunchAction extends BaseAction {\n _metadata: 'do';\n action: 'Launch';\n app: string;\n}\n\nexport interface BackAction extends BaseAction {\n _metadata: 'do';\n action: 'Back';\n}\n\nexport interface HomeAction extends BaseAction {\n _metadata: 'do';\n action: 'Home';\n}\n\nexport interface WaitAction extends BaseAction {\n _metadata: 'do';\n action: 'Wait';\n durationMs: number;\n}\n\nexport interface InteractAction extends BaseAction {\n _metadata: 'do';\n action: 'Interact';\n}\n\nexport interface CallAPIAction extends BaseAction {\n _metadata: 'do';\n action: 'Call_API';\n instruction: string;\n}\n\nexport interface TakeoverAction extends BaseAction {\n _metadata: 'do';\n action: 'Take_over';\n message: string;\n}\n\nexport interface NoteAction extends BaseAction {\n _metadata: 'do';\n action: 'Note';\n message: string;\n}\n\nexport interface FinishAction extends BaseAction {\n _metadata: 'finish';\n message: string;\n}\n\nexport type ParsedAction =\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction\n | FinishAction;\n\nconst BACK_BUTTON_NAMES = ['AndroidBackButton', 'HarmonyBackButton'];\nconst HOME_BUTTON_NAMES = ['AndroidHomeButton', 'HarmonyHomeButton'];\n\n/**\n * Find the action name in actionSpace that matches one of the known names.\n * Falls back to defaultName if no match found or actionSpace is not provided.\n */\nfunction findActionName(\n actionSpace: DeviceAction[] | undefined,\n knownNames: string[],\n defaultName: string,\n): string {\n if (!actionSpace) return defaultName;\n const match = actionSpace.find((a) => knownNames.includes(a.name));\n return match ? match.name : defaultName;\n}\n\nexport function transformAutoGLMAction(\n action: ParsedAction,\n size: { width: number; height: number },\n actionSpace?: DeviceAction[],\n): PlanningAction[] {\n try {\n switch (action._metadata) {\n case 'finish': {\n const finishAction = action as FinishAction;\n debug('Transform finish action:', finishAction);\n return [\n {\n type: 'Finished',\n param: {},\n thought: finishAction.message,\n },\n ];\n }\n case 'do': {\n const doAction = action as\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction;\n\n switch ((doAction as any).action) {\n case 'Tap': {\n const tapAction = doAction as TapAction;\n debug('Transform Tap action:', tapAction);\n const locate = autoGLMCoordinateToLocateParam(\n tapAction.element,\n size,\n );\n\n return [\n {\n type: 'Tap',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Double Tap': {\n const doubleTapAction = doAction as DoubleTapAction;\n debug('Transform Double Tap action:', doubleTapAction);\n const locate = autoGLMCoordinateToLocateParam(\n doubleTapAction.element,\n size,\n );\n\n return [\n {\n type: 'DoubleClick',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Type': {\n const typeAction = doAction as TypeAction;\n debug('Transform Type action:', typeAction);\n\n return [\n {\n type: 'Input',\n param: {\n value: typeAction.text,\n },\n },\n ];\n }\n case 'Swipe': {\n const swipeAction = doAction as SwipeAction;\n debug('Transform Swipe action:', swipeAction);\n\n const locate = autoGLMCoordinateToLocateParam(\n swipeAction.start,\n size,\n );\n\n // Calculate horizontal and vertical delta in [0,AUTO_GLM_COORDINATE_MAX] coordinate system\n const deltaX = swipeAction.end[0] - swipeAction.start[0];\n const deltaY = swipeAction.end[1] - swipeAction.start[1];\n\n // Determine direction and distance\n let direction: 'up' | 'down' | 'left' | 'right';\n let distance: number;\n\n const absDeltaX = Math.abs(deltaX);\n const absDeltaY = Math.abs(deltaY);\n\n if (absDeltaY > absDeltaX) {\n // Vertical scroll\n distance = Math.round(\n (absDeltaY * size.height) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaY > 0 ? 'up' : 'down';\n } else {\n // Horizontal scroll\n distance = Math.round(\n (absDeltaX * size.width) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaX > 0 ? 'left' : 'right';\n }\n\n debug(\n `Calculate swipe direction: ${direction}, distance: ${distance}`,\n );\n\n return [\n {\n type: 'Scroll',\n param: {\n locate,\n // The scrolling direction here all refers to which direction of the page's content will appear on the screen.\n distance,\n direction,\n },\n thought: swipeAction.think || '',\n },\n ];\n }\n case 'Long Press': {\n const longPressAction = doAction as LongPressAction;\n debug('Transform Long Press action:', longPressAction);\n const locate = autoGLMCoordinateToLocateParam(\n longPressAction.element,\n size,\n );\n\n return [\n {\n type: 'LongPress',\n param: {\n locate,\n },\n thought: longPressAction.think || '',\n },\n ];\n }\n case 'Back': {\n const backAction = doAction as BackAction;\n debug('Transform Back action:', backAction);\n return [\n {\n type: findActionName(\n actionSpace,\n BACK_BUTTON_NAMES,\n 'AndroidBackButton',\n ),\n param: {},\n thought: backAction.think || '',\n },\n ];\n }\n case 'Home': {\n const homeAction = doAction as HomeAction;\n debug('Transform Home action:', homeAction);\n return [\n {\n type: findActionName(\n actionSpace,\n HOME_BUTTON_NAMES,\n 'AndroidHomeButton',\n ),\n param: {},\n thought: homeAction.think || '',\n },\n ];\n }\n case 'Wait': {\n const waitAction = doAction as WaitAction;\n debug('Transform Wait action:', waitAction);\n return [\n {\n type: 'Sleep',\n param: {\n timeMs: waitAction.durationMs,\n },\n thought: waitAction.think || '',\n },\n ];\n }\n case 'Launch': {\n const launchAction = doAction as LaunchAction;\n debug('Transform Launch action:', launchAction);\n return [\n {\n type: 'Launch',\n param: { uri: launchAction.app },\n thought: launchAction.think || '',\n },\n ];\n }\n case 'Interact': {\n throw new Error(\n `Action \"Interact\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Call_API': {\n throw new Error(\n `Action \"Call_API\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Take_over': {\n throw new Error(\n `Action \"Take_over\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Note': {\n throw new Error(\n `Action \"Note\" from auto-glm is not supported in the current implementation.`,\n );\n }\n default:\n throw new Error(\n `Unknown do() action type: ${(doAction as any).action}`,\n );\n }\n }\n default:\n throw new Error(\n `Unknown action metadata: ${(action as any)._metadata}`,\n );\n }\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n debug('Transform error:', errorMessage);\n throw new Error(`Failed to transform action: ${errorMessage}`);\n }\n}\n"],"names":["debug","getDebug","AUTO_GLM_COORDINATE_MAX","autoGLMCoordinateToLocateParam","coordinate","size","ctx","pixelBbox","mapLocateResultToPixelBboxByCoordinates","finalizePixelBbox","BACK_BUTTON_NAMES","HOME_BUTTON_NAMES","findActionName","actionSpace","knownNames","defaultName","match","a","transformAutoGLMAction","action","finishAction","doAction","tapAction","locate","doubleTapAction","typeAction","swipeAction","deltaX","deltaY","direction","distance","absDeltaX","Math","absDeltaY","longPressAction","backAction","homeAction","waitAction","launchAction","Error","error","errorMessage","String"],"mappings":";;;AASA,MAAMA,QAAQC,SAAS;AAKvB,MAAMC,0BAA0B;AAEhC,SAASC,+BACPC,UAA4B,EAC5BC,IAAuC;IAEvC,MAAMC,MAAM;QAAE,cAAcD;IAAK;IACjC,MAAME,YAAYC,wCAChB;QAAE,MAAM;QAAS,aAAaJ;IAAW,GACzCE,KACA;QAAE,OAAO;QAAS,OAAO;QAAM,cAAcJ;IAAwB;IAGvE,OAAO;QACL,QAAQ;QACR,kBAAkBO,kBAAkBF,WAAWH,YAAYE;IAC7D;AACF;AAwGA,MAAMI,oBAAoB;IAAC;IAAqB;CAAoB;AACpE,MAAMC,oBAAoB;IAAC;IAAqB;CAAoB;AAMpE,SAASC,eACPC,WAAuC,EACvCC,UAAoB,EACpBC,WAAmB;IAEnB,IAAI,CAACF,aAAa,OAAOE;IACzB,MAAMC,QAAQH,YAAY,IAAI,CAAC,CAACI,IAAMH,WAAW,QAAQ,CAACG,EAAE,IAAI;IAChE,OAAOD,QAAQA,MAAM,IAAI,GAAGD;AAC9B;AAEO,SAASG,uBACdC,MAAoB,EACpBd,IAAuC,EACvCQ,WAA4B;IAE5B,IAAI;QACF,OAAQM,OAAO,SAAS;YACtB,KAAK;gBAAU;oBACb,MAAMC,eAAeD;oBACrBnB,MAAM,4BAA4BoB;oBAClC,OAAO;wBACL;4BACE,MAAM;4BACN,OAAO,CAAC;4BACR,SAASA,aAAa,OAAO;wBAC/B;qBACD;gBACH;YACA,KAAK;gBAAM;oBACT,MAAMC,WAAWF;oBAejB,OAASE,SAAiB,MAAM;wBAC9B,KAAK;4BAAO;gCACV,MAAMC,YAAYD;gCAClBrB,MAAM,yBAAyBsB;gCAC/B,MAAMC,SAASpB,+BACbmB,UAAU,OAAO,EACjBjB;gCAGF,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLkB;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMC,kBAAkBH;gCACxBrB,MAAM,gCAAgCwB;gCACtC,MAAMD,SAASpB,+BACbqB,gBAAgB,OAAO,EACvBnB;gCAGF,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLkB;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAME,aAAaJ;gCACnBrB,MAAM,0BAA0ByB;gCAEhC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,OAAOA,WAAW,IAAI;wCACxB;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAS;gCACZ,MAAMC,cAAcL;gCACpBrB,MAAM,2BAA2B0B;gCAEjC,MAAMH,SAASpB,+BACbuB,YAAY,KAAK,EACjBrB;gCAIF,MAAMsB,SAASD,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCACxD,MAAME,SAASF,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCAGxD,IAAIG;gCACJ,IAAIC;gCAEJ,MAAMC,YAAYC,KAAK,GAAG,CAACL;gCAC3B,MAAMM,YAAYD,KAAK,GAAG,CAACJ;gCAE3B,IAAIK,YAAYF,WAAW;oCAEzBD,WAAWE,KAAK,KAAK,CAClBC,YAAY5B,KAAK,MAAM,GAAIH;oCAE9B2B,YAAYD,SAAS,IAAI,OAAO;gCAClC,OAAO;oCAELE,WAAWE,KAAK,KAAK,CAClBD,YAAY1B,KAAK,KAAK,GAAIH;oCAE7B2B,YAAYF,SAAS,IAAI,SAAS;gCACpC;gCAEA3B,MACE,CAAC,2BAA2B,EAAE6B,UAAU,YAAY,EAAEC,UAAU;gCAGlE,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLP;4CAEAO;4CACAD;wCACF;wCACA,SAASH,YAAY,KAAK,IAAI;oCAChC;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMQ,kBAAkBb;gCACxBrB,MAAM,gCAAgCkC;gCACtC,MAAMX,SAASpB,+BACb+B,gBAAgB,OAAO,EACvB7B;gCAGF,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLkB;wCACF;wCACA,SAASW,gBAAgB,KAAK,IAAI;oCACpC;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAad;gCACnBrB,MAAM,0BAA0BmC;gCAChC,OAAO;oCACL;wCACE,MAAMvB,eACJC,aACAH,mBACA;wCAEF,OAAO,CAAC;wCACR,SAASyB,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAaf;gCACnBrB,MAAM,0BAA0BoC;gCAChC,OAAO;oCACL;wCACE,MAAMxB,eACJC,aACAF,mBACA;wCAEF,OAAO,CAAC;wCACR,SAASyB,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAahB;gCACnBrB,MAAM,0BAA0BqC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,QAAQA,WAAW,UAAU;wCAC/B;wCACA,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAU;gCACb,MAAMC,eAAejB;gCACrBrB,MAAM,4BAA4BsC;gCAClC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CAAE,KAAKA,aAAa,GAAG;wCAAC;wCAC/B,SAASA,aAAa,KAAK,IAAI;oCACjC;iCACD;4BACH;wBACA,KAAK;4BACH,MAAM,IAAIC,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ;4BACE,MAAM,IAAIA,MACR,CAAC,0BAA0B,EAAGlB,SAAiB,MAAM,EAAE;oBAE7D;gBACF;YACA;gBACE,MAAM,IAAIkB,MACR,CAAC,yBAAyB,EAAGpB,OAAe,SAAS,EAAE;QAE7D;IACF,EAAE,OAAOqB,OAAO;QACd,MAAMC,eAAeD,iBAAiBD,QAAQC,MAAM,OAAO,GAAGE,OAAOF;QACrExC,MAAM,oBAAoByC;QAC1B,MAAM,IAAIF,MAAM,CAAC,4BAA4B,EAAEE,cAAc;IAC/D;AACF"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { autoGlmLocate } from "./locate.mjs";
|
|
2
|
+
import { autoGlmPlanning } from "./planning.mjs";
|
|
3
|
+
import { getAutoGLMChineseLocatePrompt, getAutoGLMChinesePlanPrompt, getAutoGLMMultilingualLocatePrompt, getAutoGLMMultilingualPlanPrompt } from "./prompt.mjs";
|
|
4
|
+
const defaultAutoGlmReplanningCycleLimit = 100;
|
|
5
|
+
function createAutoGlmAdapter({ getPlanPrompt, getLocatePrompt }) {
|
|
6
|
+
return {
|
|
7
|
+
chatCompletion: {
|
|
8
|
+
unsupportedUserConfig: [
|
|
9
|
+
'reasoningEnabled',
|
|
10
|
+
'reasoningEffort',
|
|
11
|
+
'reasoningBudget'
|
|
12
|
+
],
|
|
13
|
+
buildChatCompletionParams: ({ midsceneDefaults, userConfig })=>({
|
|
14
|
+
config: {
|
|
15
|
+
temperature: userConfig.temperature ?? midsceneDefaults.temperature,
|
|
16
|
+
top_p: 0.85,
|
|
17
|
+
frequency_penalty: 0.2
|
|
18
|
+
}
|
|
19
|
+
})
|
|
20
|
+
},
|
|
21
|
+
planning: {
|
|
22
|
+
kind: 'custom',
|
|
23
|
+
cacheEnabled: false,
|
|
24
|
+
defaultReplanningCycleLimit: defaultAutoGlmReplanningCycleLimit,
|
|
25
|
+
planFn: (userInstruction, options)=>autoGlmPlanning(userInstruction, options, getPlanPrompt)
|
|
26
|
+
},
|
|
27
|
+
locate: {
|
|
28
|
+
kind: 'custom',
|
|
29
|
+
locateFn: (elementDescription, options)=>autoGlmLocate(elementDescription, options, getLocatePrompt)
|
|
30
|
+
}
|
|
31
|
+
};
|
|
32
|
+
}
|
|
33
|
+
const autoGlmAdapters = {
|
|
34
|
+
'auto-glm': createAutoGlmAdapter({
|
|
35
|
+
getPlanPrompt: getAutoGLMChinesePlanPrompt,
|
|
36
|
+
getLocatePrompt: getAutoGLMChineseLocatePrompt
|
|
37
|
+
}),
|
|
38
|
+
'auto-glm-multilingual': createAutoGlmAdapter({
|
|
39
|
+
getPlanPrompt: getAutoGLMMultilingualPlanPrompt,
|
|
40
|
+
getLocatePrompt: getAutoGLMMultilingualLocatePrompt
|
|
41
|
+
})
|
|
42
|
+
};
|
|
43
|
+
export { autoGlmAdapters };
|
|
44
|
+
|
|
45
|
+
//# sourceMappingURL=adapter.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/models/auto-glm/adapter.mjs","sources":["../../../../../src/ai-model/models/auto-glm/adapter.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport type { ModelAdapterDefinition } from '../types';\nimport { autoGlmLocate } from './locate';\nimport { autoGlmPlanning } from './planning';\nimport {\n getAutoGLMChineseLocatePrompt,\n getAutoGLMChinesePlanPrompt,\n getAutoGLMMultilingualLocatePrompt,\n getAutoGLMMultilingualPlanPrompt,\n} from './prompt';\n\nconst defaultAutoGlmReplanningCycleLimit = 100;\n\nfunction createAutoGlmAdapter({\n getPlanPrompt,\n getLocatePrompt,\n}: {\n getPlanPrompt: () => string;\n getLocatePrompt: () => string;\n}): ModelAdapterDefinition {\n return {\n chatCompletion: {\n unsupportedUserConfig: [\n 'reasoningEnabled',\n 'reasoningEffort',\n 'reasoningBudget',\n ],\n buildChatCompletionParams: ({ midsceneDefaults, userConfig }) => ({\n config: {\n temperature: userConfig.temperature ?? midsceneDefaults.temperature,\n top_p: 0.85,\n frequency_penalty: 0.2,\n },\n }),\n },\n planning: {\n kind: 'custom',\n cacheEnabled: false,\n defaultReplanningCycleLimit: defaultAutoGlmReplanningCycleLimit,\n planFn: (userInstruction, options) =>\n autoGlmPlanning(userInstruction, options, getPlanPrompt),\n },\n locate: {\n kind: 'custom',\n locateFn: (elementDescription, options) =>\n autoGlmLocate(elementDescription, options, getLocatePrompt),\n },\n };\n}\n\nexport const autoGlmAdapters = {\n 'auto-glm': createAutoGlmAdapter({\n getPlanPrompt: getAutoGLMChinesePlanPrompt,\n getLocatePrompt: getAutoGLMChineseLocatePrompt,\n }),\n 'auto-glm-multilingual': createAutoGlmAdapter({\n getPlanPrompt: getAutoGLMMultilingualPlanPrompt,\n getLocatePrompt: getAutoGLMMultilingualLocatePrompt,\n }),\n} satisfies Pick<\n Record<TModelFamily, ModelAdapterDefinition>,\n 'auto-glm' | 'auto-glm-multilingual'\n>;\n"],"names":["defaultAutoGlmReplanningCycleLimit","createAutoGlmAdapter","getPlanPrompt","getLocatePrompt","midsceneDefaults","userConfig","userInstruction","options","autoGlmPlanning","elementDescription","autoGlmLocate","autoGlmAdapters","getAutoGLMChinesePlanPrompt","getAutoGLMChineseLocatePrompt","getAutoGLMMultilingualPlanPrompt","getAutoGLMMultilingualLocatePrompt"],"mappings":";;;AAWA,MAAMA,qCAAqC;AAE3C,SAASC,qBAAqB,EAC5BC,aAAa,EACbC,eAAe,EAIhB;IACC,OAAO;QACL,gBAAgB;YACd,uBAAuB;gBACrB;gBACA;gBACA;aACD;YACD,2BAA2B,CAAC,EAAEC,gBAAgB,EAAEC,UAAU,EAAE,GAAM;oBAChE,QAAQ;wBACN,aAAaA,WAAW,WAAW,IAAID,iBAAiB,WAAW;wBACnE,OAAO;wBACP,mBAAmB;oBACrB;gBACF;QACF;QACA,UAAU;YACR,MAAM;YACN,cAAc;YACd,6BAA6BJ;YAC7B,QAAQ,CAACM,iBAAiBC,UACxBC,gBAAgBF,iBAAiBC,SAASL;QAC9C;QACA,QAAQ;YACN,MAAM;YACN,UAAU,CAACO,oBAAoBF,UAC7BG,cAAcD,oBAAoBF,SAASJ;QAC/C;IACF;AACF;AAEO,MAAMQ,kBAAkB;IAC7B,YAAYV,qBAAqB;QAC/B,eAAeW;QACf,iBAAiBC;IACnB;IACA,yBAAyBZ,qBAAqB;QAC5C,eAAea;QACf,iBAAiBC;IACnB;AACF"}
|