@donggui/core 1.5.4-donggui.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +9 -0
- package/dist/es/agent/agent.mjs +709 -0
- package/dist/es/agent/agent.mjs.map +1 -0
- package/dist/es/agent/common.mjs +0 -0
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/index.mjs +6 -0
- package/dist/es/agent/task-builder.mjs +330 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/task-cache.mjs +186 -0
- package/dist/es/agent/task-cache.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +422 -0
- package/dist/es/agent/tasks.mjs.map +1 -0
- package/dist/es/agent/ui-utils.mjs +91 -0
- package/dist/es/agent/ui-utils.mjs.map +1 -0
- package/dist/es/agent/utils.mjs +198 -0
- package/dist/es/agent/utils.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/actions.mjs +224 -0
- package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/index.mjs +6 -0
- package/dist/es/ai-model/auto-glm/parser.mjs +239 -0
- package/dist/es/ai-model/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/planning.mjs +71 -0
- package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs +222 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/util.mjs +9 -0
- package/dist/es/ai-model/auto-glm/util.mjs.map +1 -0
- package/dist/es/ai-model/conversation-history.mjs +195 -0
- package/dist/es/ai-model/conversation-history.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +11 -0
- package/dist/es/ai-model/inspect.mjs +386 -0
- package/dist/es/ai-model/inspect.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +233 -0
- package/dist/es/ai-model/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/common.mjs +7 -0
- package/dist/es/ai-model/prompt/common.mjs.map +1 -0
- package/dist/es/ai-model/prompt/describe.mjs +66 -0
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +129 -0
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +51 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +364 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +44 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/util.mjs +59 -0
- package/dist/es/ai-model/prompt/util.mjs.map +1 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +466 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +249 -0
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
- package/dist/es/common.mjs +371 -0
- package/dist/es/common.mjs.map +1 -0
- package/dist/es/device/device-options.mjs +0 -0
- package/dist/es/device/index.mjs +300 -0
- package/dist/es/device/index.mjs.map +1 -0
- package/dist/es/dump/html-utils.mjs +211 -0
- package/dist/es/dump/html-utils.mjs.map +1 -0
- package/dist/es/dump/image-restoration.mjs +43 -0
- package/dist/es/dump/image-restoration.mjs.map +1 -0
- package/dist/es/dump/index.mjs +3 -0
- package/dist/es/index.mjs +15 -0
- package/dist/es/index.mjs.map +1 -0
- package/dist/es/report-generator.mjs +134 -0
- package/dist/es/report-generator.mjs.map +1 -0
- package/dist/es/report.mjs +111 -0
- package/dist/es/report.mjs.map +1 -0
- package/dist/es/screenshot-item.mjs +105 -0
- package/dist/es/screenshot-item.mjs.map +1 -0
- package/dist/es/service/index.mjs +256 -0
- package/dist/es/service/index.mjs.map +1 -0
- package/dist/es/service/utils.mjs +15 -0
- package/dist/es/service/utils.mjs.map +1 -0
- package/dist/es/skill/index.mjs +38 -0
- package/dist/es/skill/index.mjs.map +1 -0
- package/dist/es/task-runner.mjs +258 -0
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/task-timing.mjs +12 -0
- package/dist/es/task-timing.mjs.map +1 -0
- package/dist/es/tree.mjs +13 -0
- package/dist/es/tree.mjs.map +1 -0
- package/dist/es/types.mjs +196 -0
- package/dist/es/types.mjs.map +1 -0
- package/dist/es/utils.mjs +218 -0
- package/dist/es/utils.mjs.map +1 -0
- package/dist/es/yaml/builder.mjs +13 -0
- package/dist/es/yaml/builder.mjs.map +1 -0
- package/dist/es/yaml/index.mjs +4 -0
- package/dist/es/yaml/player.mjs +418 -0
- package/dist/es/yaml/player.mjs.map +1 -0
- package/dist/es/yaml/utils.mjs +73 -0
- package/dist/es/yaml/utils.mjs.map +1 -0
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/agent/agent.js +757 -0
- package/dist/lib/agent/agent.js.map +1 -0
- package/dist/lib/agent/common.js +5 -0
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/index.js +81 -0
- package/dist/lib/agent/index.js.map +1 -0
- package/dist/lib/agent/task-builder.js +367 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/task-cache.js +238 -0
- package/dist/lib/agent/task-cache.js.map +1 -0
- package/dist/lib/agent/tasks.js +465 -0
- package/dist/lib/agent/tasks.js.map +1 -0
- package/dist/lib/agent/ui-utils.js +143 -0
- package/dist/lib/agent/ui-utils.js.map +1 -0
- package/dist/lib/agent/utils.js +275 -0
- package/dist/lib/agent/utils.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/actions.js +258 -0
- package/dist/lib/ai-model/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/index.js +66 -0
- package/dist/lib/ai-model/auto-glm/index.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/parser.js +282 -0
- package/dist/lib/ai-model/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/planning.js +105 -0
- package/dist/lib/ai-model/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/prompt.js +259 -0
- package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/util.js +46 -0
- package/dist/lib/ai-model/auto-glm/util.js.map +1 -0
- package/dist/lib/ai-model/conversation-history.js +229 -0
- package/dist/lib/ai-model/conversation-history.js.map +1 -0
- package/dist/lib/ai-model/index.js +125 -0
- package/dist/lib/ai-model/index.js.map +1 -0
- package/dist/lib/ai-model/inspect.js +429 -0
- package/dist/lib/ai-model/inspect.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +270 -0
- package/dist/lib/ai-model/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/common.js +41 -0
- package/dist/lib/ai-model/prompt/common.js.map +1 -0
- package/dist/lib/ai-model/prompt/describe.js +100 -0
- package/dist/lib/ai-model/prompt/describe.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +169 -0
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +88 -0
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +401 -0
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +81 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/util.js +105 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +531 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -0
- package/dist/lib/ai-model/ui-tars-planning.js +283 -0
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
- package/dist/lib/common.js +480 -0
- package/dist/lib/common.js.map +1 -0
- package/dist/lib/device/device-options.js +20 -0
- package/dist/lib/device/device-options.js.map +1 -0
- package/dist/lib/device/index.js +418 -0
- package/dist/lib/device/index.js.map +1 -0
- package/dist/lib/dump/html-utils.js +281 -0
- package/dist/lib/dump/html-utils.js.map +1 -0
- package/dist/lib/dump/image-restoration.js +77 -0
- package/dist/lib/dump/image-restoration.js.map +1 -0
- package/dist/lib/dump/index.js +60 -0
- package/dist/lib/dump/index.js.map +1 -0
- package/dist/lib/index.js +146 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/report-generator.js +172 -0
- package/dist/lib/report-generator.js.map +1 -0
- package/dist/lib/report.js +145 -0
- package/dist/lib/report.js.map +1 -0
- package/dist/lib/screenshot-item.js +139 -0
- package/dist/lib/screenshot-item.js.map +1 -0
- package/dist/lib/service/index.js +290 -0
- package/dist/lib/service/index.js.map +1 -0
- package/dist/lib/service/utils.js +49 -0
- package/dist/lib/service/utils.js.map +1 -0
- package/dist/lib/skill/index.js +72 -0
- package/dist/lib/skill/index.js.map +1 -0
- package/dist/lib/task-runner.js +295 -0
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/task-timing.js +46 -0
- package/dist/lib/task-timing.js.map +1 -0
- package/dist/lib/tree.js +53 -0
- package/dist/lib/tree.js.map +1 -0
- package/dist/lib/types.js +285 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.js +297 -0
- package/dist/lib/utils.js.map +1 -0
- package/dist/lib/yaml/builder.js +57 -0
- package/dist/lib/yaml/builder.js.map +1 -0
- package/dist/lib/yaml/index.js +81 -0
- package/dist/lib/yaml/index.js.map +1 -0
- package/dist/lib/yaml/player.js +452 -0
- package/dist/lib/yaml/player.js.map +1 -0
- package/dist/lib/yaml/utils.js +126 -0
- package/dist/lib/yaml/utils.js.map +1 -0
- package/dist/lib/yaml.js +20 -0
- package/dist/lib/yaml.js.map +1 -0
- package/dist/types/agent/agent.d.ts +190 -0
- package/dist/types/agent/common.d.ts +0 -0
- package/dist/types/agent/execution-session.d.ts +36 -0
- package/dist/types/agent/index.d.ts +10 -0
- package/dist/types/agent/task-builder.d.ts +34 -0
- package/dist/types/agent/task-cache.d.ts +48 -0
- package/dist/types/agent/tasks.d.ts +70 -0
- package/dist/types/agent/ui-utils.d.ts +14 -0
- package/dist/types/agent/utils.d.ts +29 -0
- package/dist/types/ai-model/auto-glm/actions.d.ts +77 -0
- package/dist/types/ai-model/auto-glm/index.d.ts +6 -0
- package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
- package/dist/types/ai-model/auto-glm/planning.d.ts +10 -0
- package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
- package/dist/types/ai-model/auto-glm/util.d.ts +13 -0
- package/dist/types/ai-model/conversation-history.d.ts +105 -0
- package/dist/types/ai-model/index.d.ts +14 -0
- package/dist/types/ai-model/inspect.d.ts +58 -0
- package/dist/types/ai-model/llm-planning.d.ts +19 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +7 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +10 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +33 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +100 -0
- package/dist/types/ai-model/service-caller/index.d.ts +49 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +72 -0
- package/dist/types/common.d.ts +288 -0
- package/dist/types/device/device-options.d.ts +142 -0
- package/dist/types/device/index.d.ts +2315 -0
- package/dist/types/dump/html-utils.d.ts +52 -0
- package/dist/types/dump/image-restoration.d.ts +6 -0
- package/dist/types/dump/index.d.ts +5 -0
- package/dist/types/index.d.ts +17 -0
- package/dist/types/report-generator.d.ts +48 -0
- package/dist/types/report.d.ts +15 -0
- package/dist/types/screenshot-item.d.ts +66 -0
- package/dist/types/service/index.d.ts +23 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/skill/index.d.ts +25 -0
- package/dist/types/task-runner.d.ts +48 -0
- package/dist/types/task-timing.d.ts +8 -0
- package/dist/types/tree.d.ts +4 -0
- package/dist/types/types.d.ts +645 -0
- package/dist/types/utils.d.ts +40 -0
- package/dist/types/yaml/builder.d.ts +2 -0
- package/dist/types/yaml/index.d.ts +4 -0
- package/dist/types/yaml/player.d.ts +34 -0
- package/dist/types/yaml/utils.d.ts +9 -0
- package/dist/types/yaml.d.ts +203 -0
- package/package.json +111 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n const type = actionType.trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n abortSignal?: AbortSignal;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when deepThink is true\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = shotSize.width;\n let imageHeight = shotSize.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In deepThink mode: show full sub-goals with logs\n // In non-deepThink mode: show historical execution logs\n const subGoalsText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const subGoalsSection = subGoalsText ? `\\n\\n${subGoalsText}` : '';\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n abortSignal: opts.abortSignal,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n } catch {\n const retry = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n abortSignal: opts.abortSignal,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history based on response (only when deepThink is enabled)\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // In non-deepThink mode, accumulate logs as historical execution steps\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","shotSize","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","subGoalsText","subGoalsSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;AA+BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAK9C,SAASE,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QACrD,MAAMc,OAAOd,WAAW,IAAI;QAC5B,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,cAAcf,gBAAgBP;QACxC,EAAE,OAAOuB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAWC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,QAAQ,EAAE,GAAGH;IACrB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,SAAS,KAAK;IAC/B,IAAIO,cAAcP,SAAS,MAAM;IAKjC,IAAI/B,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMuC,eAAe,MAAMC,4BAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBd,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMe,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEf,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIiB;IAKJ,MAAMC,eAAeX,kBACjBH,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMe,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAG/D,MAAME,eAAehB,oBAAoB,cAAc;IACvD,MAAMiB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIhB,oBAAoB,sBAAsB,EAAE;QAC9Ca,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGb,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEiB,kBAAkBF,iBAAiB;gBAChN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEa,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,kBAAkBF,iBAAiB;YAC3E;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACa;IAG3Bb,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMkB,aAAalB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMsB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMpB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;QAClE,aAAaA,KAAK,WAAW;IAC/B;IAGA,IAAI2B;IACJ,IAAI;QACF,IAAI;YACFA,aAAaxD,yBAAyBoD,aAAalD;QACrD,EAAE,OAAM;YACN,MAAMuD,QAAQ,MAAMF,OAAOJ,MAAMpB,aAAa;gBAC5C,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;gBAClE,aAAaA,KAAK,WAAW;YAC/B;YACAuB,cAAcK,MAAM,OAAO;YAC3BJ,QAAQI,MAAM,KAAK;YACnBH,oBAAoBG,MAAM,iBAAiB;YAC3CD,aAAaxD,yBAAyBoD,aAAalD;QACrD;QAEA,IAAIsD,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YACjEzD,QACE;YAEFyD,WAAW,eAAe,GAAG1C;YAC7B0C,WAAW,eAAe,GAAG1C;QAC/B;QAEA,MAAM4C,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YAC5C3D,MAAM;YACN8D,yBAAyB;YAEzB,IAAIxB,iBACFH,oBAAoB,uBAAuB;QAE/C;QAEA,MAAM4B,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAN;YACAC;YACAC;YACA,UAAUO,uBAAuBH,SAAS7B,KAAK,WAAW;YAC1D8B;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACrC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM0C,sBAAsBlC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;YAG9BzB,MAAM,+BAA+BkE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENlE,MAAM,gBAAgBmE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAe9C,OAAO,KAAK,CAAC6C,MAAM;gBACxC,IAAIC,gBAAgBjE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAAC6C,MAAM,GAAGE,cACpBD,cACA5B,YACAC,aACAtC;YAGN;QACF;QAGA,IAAIiC,iBAAiB;YACnB,IAAIqB,WAAW,cAAc,EAAE,QAC7BxB,oBAAoB,aAAa,CAACwB,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMa,SAASb,WAAW,mBAAmB,CAChDxB,oBAAoB,mBAAmB,CAACqC;YAI5C,IAAIb,WAAW,GAAG,EAChBxB,oBAAoB,gBAAgB,CAACwB,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChBxB,oBAAoB,mBAAmB,CAACwB,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnBxB,oBAAoB,YAAY,CAACwB,WAAW,MAAM;QAGpDxB,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMoB;gBACR;aACD;QACH;QAEA,OAAOQ;IACT,EAAE,OAAOU,YAAY;QAEnB,MAAMC,eACJD,sBAAsB5C,QAAQ4C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCnB,aACAC;IAEJ;AACF"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
function bboxDescription(modelFamily) {
|
|
2
|
+
if ('gemini' === modelFamily) return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';
|
|
3
|
+
return '2d bounding box as [xmin, ymin, xmax, ymax]';
|
|
4
|
+
}
|
|
5
|
+
export { bboxDescription };
|
|
6
|
+
|
|
7
|
+
//# sourceMappingURL=common.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/common.mjs","sources":["../../../../src/ai-model/prompt/common.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nexport function bboxDescription(modelFamily: TModelFamily | undefined) {\n if (modelFamily === 'gemini') {\n return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';\n }\n return '2d bounding box as [xmin, ymin, xmax, ymax]';\n}\n"],"names":["bboxDescription","modelFamily"],"mappings":"AACO,SAASA,gBAAgBC,WAAqC;IACnE,IAAIA,AAAgB,aAAhBA,aACF,OAAO;IAET,OAAO;AACT"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { getPreferredLanguage } from "@midscene/shared/env";
|
|
2
|
+
const examplesMap = {
|
|
3
|
+
Chinese: [
|
|
4
|
+
'"登录表单中的"登录"按钮"',
|
|
5
|
+
'"搜索输入框,placeholder 为"请输入关键词""',
|
|
6
|
+
'"顶部导航栏中文字为"首页"的链接"',
|
|
7
|
+
'"联系表单中的提交按钮"',
|
|
8
|
+
'"aria-label 为"打开菜单"的菜单图标"'
|
|
9
|
+
],
|
|
10
|
+
English: [
|
|
11
|
+
'"Login button with text \'Sign In\'"',
|
|
12
|
+
'"Search input with placeholder \'Enter keywords\'"',
|
|
13
|
+
'"Navigation link with text \'Home\' in header"',
|
|
14
|
+
'"Submit button in contact form"',
|
|
15
|
+
'"Menu icon with aria-label \'Open menu\'"'
|
|
16
|
+
]
|
|
17
|
+
};
|
|
18
|
+
const getExamples = (language)=>{
|
|
19
|
+
const examples = examplesMap[language] || examplesMap.English;
|
|
20
|
+
return examples.map((e)=>`- ${e}`).join('\n');
|
|
21
|
+
};
|
|
22
|
+
const elementDescriberInstruction = ()=>{
|
|
23
|
+
const preferredLanguage = getPreferredLanguage();
|
|
24
|
+
return `
|
|
25
|
+
Describe the element in the red rectangle for precise identification.
|
|
26
|
+
|
|
27
|
+
IMPORTANT: You MUST write the description in ${preferredLanguage}.
|
|
28
|
+
|
|
29
|
+
CRITICAL REQUIREMENTS:
|
|
30
|
+
1. UNIQUENESS: The description must uniquely identify this element on the current page
|
|
31
|
+
2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
|
|
32
|
+
3. PRECISION: Be specific enough to distinguish from similar elements
|
|
33
|
+
|
|
34
|
+
DESCRIPTION STRUCTURE:
|
|
35
|
+
1. Element type (button, input, link, div, etc.)
|
|
36
|
+
2. Primary identifier (in order of preference):
|
|
37
|
+
- Unique text content: "with text 'Login'"
|
|
38
|
+
- Unique attribute: "with aria-label 'Search'"
|
|
39
|
+
- Unique class/ID: "with class 'primary-button'"
|
|
40
|
+
- Unique position: "in header navigation"
|
|
41
|
+
3. Secondary identifiers (if needed for uniqueness):
|
|
42
|
+
- Visual features: "blue background", "with icon"
|
|
43
|
+
- Relative position: "below search bar", "in sidebar"
|
|
44
|
+
- Parent context: "in login form", "in main menu"
|
|
45
|
+
|
|
46
|
+
GUIDELINES:
|
|
47
|
+
- Keep description under 25 words
|
|
48
|
+
- Prioritize semantic identifiers over visual ones
|
|
49
|
+
- Use consistent terminology across similar elements
|
|
50
|
+
- Avoid page-specific or temporary content
|
|
51
|
+
- Don't mention the red rectangle or selection box
|
|
52
|
+
- Focus on stable, reusable characteristics
|
|
53
|
+
- **Write the description in ${preferredLanguage}**
|
|
54
|
+
|
|
55
|
+
EXAMPLES:
|
|
56
|
+
${getExamples(preferredLanguage)}
|
|
57
|
+
|
|
58
|
+
Return JSON:
|
|
59
|
+
{
|
|
60
|
+
"description": "unique element identifier",
|
|
61
|
+
"error"?: "error message if any"
|
|
62
|
+
}`;
|
|
63
|
+
};
|
|
64
|
+
export { elementDescriberInstruction };
|
|
65
|
+
|
|
66
|
+
//# sourceMappingURL=describe.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/describe.mjs","sources":["../../../../src/ai-model/prompt/describe.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\nconst examplesMap: Record<string, string[]> = {\n Chinese: [\n '\"登录表单中的\"登录\"按钮\"',\n '\"搜索输入框,placeholder 为\"请输入关键词\"\"',\n '\"顶部导航栏中文字为\"首页\"的链接\"',\n '\"联系表单中的提交按钮\"',\n '\"aria-label 为\"打开菜单\"的菜单图标\"',\n ],\n English: [\n '\"Login button with text \\'Sign In\\'\"',\n '\"Search input with placeholder \\'Enter keywords\\'\"',\n '\"Navigation link with text \\'Home\\' in header\"',\n '\"Submit button in contact form\"',\n '\"Menu icon with aria-label \\'Open menu\\'\"',\n ],\n};\n\nconst getExamples = (language: string) => {\n const examples = examplesMap[language] || examplesMap.English;\n return examples.map((e) => `- ${e}`).join('\\n');\n};\n\nexport const elementDescriberInstruction = () => {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nDescribe the element in the red rectangle for precise identification.\n\nIMPORTANT: You MUST write the description in ${preferredLanguage}.\n\nCRITICAL REQUIREMENTS:\n1. UNIQUENESS: The description must uniquely identify this element on the current page\n2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts\n3. PRECISION: Be specific enough to distinguish from similar elements\n\nDESCRIPTION STRUCTURE:\n1. Element type (button, input, link, div, etc.)\n2. Primary identifier (in order of preference):\n - Unique text content: \"with text 'Login'\"\n - Unique attribute: \"with aria-label 'Search'\"\n - Unique class/ID: \"with class 'primary-button'\"\n - Unique position: \"in header navigation\"\n3. Secondary identifiers (if needed for uniqueness):\n - Visual features: \"blue background\", \"with icon\"\n - Relative position: \"below search bar\", \"in sidebar\"\n - Parent context: \"in login form\", \"in main menu\"\n\nGUIDELINES:\n- Keep description under 25 words\n- Prioritize semantic identifiers over visual ones\n- Use consistent terminology across similar elements\n- Avoid page-specific or temporary content\n- Don't mention the red rectangle or selection box\n- Focus on stable, reusable characteristics\n- **Write the description in ${preferredLanguage}**\n\nEXAMPLES:\n${getExamples(preferredLanguage)}\n\nReturn JSON:\n{\n \"description\": \"unique element identifier\",\n \"error\"?: \"error message if any\"\n}`;\n};\n"],"names":["examplesMap","getExamples","language","examples","e","elementDescriberInstruction","preferredLanguage","getPreferredLanguage"],"mappings":";AAEA,MAAMA,cAAwC;IAC5C,SAAS;QACP;QACA;QACA;QACA;QACA;KACD;IACD,SAAS;QACP;QACA;QACA;QACA;QACA;KACD;AACH;AAEA,MAAMC,cAAc,CAACC;IACnB,MAAMC,WAAWH,WAAW,CAACE,SAAS,IAAIF,YAAY,OAAO;IAC7D,OAAOG,SAAS,GAAG,CAAC,CAACC,IAAM,CAAC,EAAE,EAAEA,GAAG,EAAE,IAAI,CAAC;AAC5C;AAEO,MAAMC,8BAA8B;IACzC,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;6CAGmC,EAAED,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;6BA0BpC,EAAEA,kBAAkB;;;AAGjD,EAAEL,YAAYK,mBAAmB;;;;;;CAMhC,CAAC;AACF"}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { getPreferredLanguage } from "@midscene/shared/env";
|
|
2
|
+
import { safeParseJson } from "../service-caller/index.mjs";
|
|
3
|
+
import { extractXMLTag } from "./util.mjs";
|
|
4
|
+
function parseXMLExtractionResponse(xmlString) {
|
|
5
|
+
const thought = extractXMLTag(xmlString, 'thought');
|
|
6
|
+
const dataJsonStr = extractXMLTag(xmlString, 'data-json');
|
|
7
|
+
const errorsStr = extractXMLTag(xmlString, 'errors');
|
|
8
|
+
if (!dataJsonStr) throw new Error('Missing required field: data-json');
|
|
9
|
+
let data;
|
|
10
|
+
try {
|
|
11
|
+
data = safeParseJson(dataJsonStr, void 0);
|
|
12
|
+
} catch (e) {
|
|
13
|
+
throw new Error(`Failed to parse data-json: ${e}`);
|
|
14
|
+
}
|
|
15
|
+
let errors;
|
|
16
|
+
if (errorsStr) try {
|
|
17
|
+
const parsedErrors = safeParseJson(errorsStr, void 0);
|
|
18
|
+
if (Array.isArray(parsedErrors)) errors = parsedErrors;
|
|
19
|
+
} catch (e) {}
|
|
20
|
+
return {
|
|
21
|
+
...thought ? {
|
|
22
|
+
thought
|
|
23
|
+
} : {},
|
|
24
|
+
data,
|
|
25
|
+
...errors && errors.length > 0 ? {
|
|
26
|
+
errors
|
|
27
|
+
} : {}
|
|
28
|
+
};
|
|
29
|
+
}
|
|
30
|
+
function systemPromptToExtract() {
|
|
31
|
+
const preferredLanguage = getPreferredLanguage();
|
|
32
|
+
return `
|
|
33
|
+
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
34
|
+
|
|
35
|
+
The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
|
|
36
|
+
|
|
37
|
+
If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
|
|
38
|
+
|
|
39
|
+
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
Return in the following XML format:
|
|
43
|
+
<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>
|
|
44
|
+
<data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>
|
|
45
|
+
<errors>optional error messages as JSON array, e.g., ["error1", "error2"]</errors>
|
|
46
|
+
|
|
47
|
+
# Example 1
|
|
48
|
+
For example, if the DATA_DEMAND is:
|
|
49
|
+
|
|
50
|
+
<DATA_DEMAND>
|
|
51
|
+
{
|
|
52
|
+
"name": "name shows on the left panel, string",
|
|
53
|
+
"age": "age shows on the right panel, number",
|
|
54
|
+
"isAdmin": "if the user is admin, boolean"
|
|
55
|
+
}
|
|
56
|
+
</DATA_DEMAND>
|
|
57
|
+
|
|
58
|
+
By viewing the screenshot and page contents, you can extract the following data:
|
|
59
|
+
|
|
60
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
61
|
+
<data-json>
|
|
62
|
+
{
|
|
63
|
+
"name": "John",
|
|
64
|
+
"age": 30,
|
|
65
|
+
"isAdmin": true
|
|
66
|
+
}
|
|
67
|
+
</data-json>
|
|
68
|
+
|
|
69
|
+
# Example 2
|
|
70
|
+
If the DATA_DEMAND is:
|
|
71
|
+
|
|
72
|
+
<DATA_DEMAND>
|
|
73
|
+
the todo items list, string[]
|
|
74
|
+
</DATA_DEMAND>
|
|
75
|
+
|
|
76
|
+
By viewing the screenshot and page contents, you can extract the following data:
|
|
77
|
+
|
|
78
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
79
|
+
<data-json>
|
|
80
|
+
["todo 1", "todo 2", "todo 3"]
|
|
81
|
+
</data-json>
|
|
82
|
+
|
|
83
|
+
# Example 3
|
|
84
|
+
If the DATA_DEMAND is:
|
|
85
|
+
|
|
86
|
+
<DATA_DEMAND>
|
|
87
|
+
the page title, string
|
|
88
|
+
</DATA_DEMAND>
|
|
89
|
+
|
|
90
|
+
By viewing the screenshot and page contents, you can extract the following data:
|
|
91
|
+
|
|
92
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
93
|
+
<data-json>
|
|
94
|
+
"todo list"
|
|
95
|
+
</data-json>
|
|
96
|
+
|
|
97
|
+
# Example 4
|
|
98
|
+
If the DATA_DEMAND is:
|
|
99
|
+
|
|
100
|
+
<DATA_DEMAND>
|
|
101
|
+
{
|
|
102
|
+
"result": "Boolean, is it currently the SMS page?"
|
|
103
|
+
}
|
|
104
|
+
</DATA_DEMAND>
|
|
105
|
+
|
|
106
|
+
By viewing the screenshot and page contents, you can extract the following data:
|
|
107
|
+
|
|
108
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
109
|
+
<data-json>
|
|
110
|
+
{ "result": true }
|
|
111
|
+
</data-json>
|
|
112
|
+
`;
|
|
113
|
+
}
|
|
114
|
+
const extractDataQueryPrompt = (pageDescription, dataQuery)=>{
|
|
115
|
+
let dataQueryText = '';
|
|
116
|
+
dataQueryText = 'string' == typeof dataQuery ? dataQuery : JSON.stringify(dataQuery, null, 2);
|
|
117
|
+
return `
|
|
118
|
+
<PageDescription>
|
|
119
|
+
${pageDescription}
|
|
120
|
+
</PageDescription>
|
|
121
|
+
|
|
122
|
+
<DATA_DEMAND>
|
|
123
|
+
${dataQueryText}
|
|
124
|
+
</DATA_DEMAND>
|
|
125
|
+
`;
|
|
126
|
+
};
|
|
127
|
+
export { extractDataQueryPrompt, parseXMLExtractionResponse, systemPromptToExtract };
|
|
128
|
+
|
|
129
|
+
//# sourceMappingURL=extraction.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/extraction.mjs","sources":["../../../../src/ai-model/prompt/extraction.ts"],"sourcesContent":["import type { AIDataExtractionResponse } from '@/types';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { safeParseJson } from '../service-caller/index';\nimport { extractXMLTag } from './util';\n\n/**\n * Parse XML response from LLM and convert to AIDataExtractionResponse\n */\nexport function parseXMLExtractionResponse<T>(\n xmlString: string,\n): AIDataExtractionResponse<T> {\n const thought = extractXMLTag(xmlString, 'thought');\n const dataJsonStr = extractXMLTag(xmlString, 'data-json');\n const errorsStr = extractXMLTag(xmlString, 'errors');\n\n // Parse data-json (required)\n if (!dataJsonStr) {\n throw new Error('Missing required field: data-json');\n }\n\n let data: T;\n try {\n data = safeParseJson(dataJsonStr, undefined) as T;\n } catch (e) {\n throw new Error(`Failed to parse data-json: ${e}`);\n }\n\n // Parse errors (optional)\n let errors: string[] | undefined;\n if (errorsStr) {\n try {\n const parsedErrors = safeParseJson(errorsStr, undefined);\n if (Array.isArray(parsedErrors)) {\n errors = parsedErrors;\n }\n } catch (e) {\n // If errors parsing fails, just ignore it\n }\n }\n\n return {\n ...(thought ? { thought } : {}),\n data,\n ...(errors && errors.length > 0 ? { errors } : {}),\n };\n}\n\nexport function systemPromptToExtract() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.\n\nThe user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.\n\nIf a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.\n\nIf the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.\n\n\nReturn in the following XML format:\n<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>\n<data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>\n<errors>optional error messages as JSON array, e.g., [\"error1\", \"error2\"]</errors>\n\n# Example 1\nFor example, if the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"name\": \"name shows on the left panel, string\",\n \"age\": \"age shows on the right panel, number\",\n \"isAdmin\": \"if the user is admin, boolean\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n{\n \"name\": \"John\",\n \"age\": 30,\n \"isAdmin\": true\n}\n</data-json>\n\n# Example 2\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe todo items list, string[]\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n[\"todo 1\", \"todo 2\", \"todo 3\"]\n</data-json>\n\n# Example 3\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe page title, string\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n\"todo list\"\n</data-json>\n\n# Example 4\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"result\": \"Boolean, is it currently the SMS page?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n{ \"result\": true }\n</data-json>\n`;\n}\n\nexport const extractDataQueryPrompt = (\n pageDescription: string,\n dataQuery: string | Record<string, string>,\n) => {\n let dataQueryText = '';\n if (typeof dataQuery === 'string') {\n dataQueryText = dataQuery;\n } else {\n dataQueryText = JSON.stringify(dataQuery, null, 2);\n }\n\n return `\n<PageDescription>\n${pageDescription}\n</PageDescription>\n\n<DATA_DEMAND>\n${dataQueryText}\n</DATA_DEMAND>\n `;\n};\n"],"names":["parseXMLExtractionResponse","xmlString","thought","extractXMLTag","dataJsonStr","errorsStr","Error","data","safeParseJson","undefined","e","errors","parsedErrors","Array","systemPromptToExtract","preferredLanguage","getPreferredLanguage","extractDataQueryPrompt","pageDescription","dataQuery","dataQueryText","JSON"],"mappings":";;;AASO,SAASA,2BACdC,SAAiB;IAEjB,MAAMC,UAAUC,cAAcF,WAAW;IACzC,MAAMG,cAAcD,cAAcF,WAAW;IAC7C,MAAMI,YAAYF,cAAcF,WAAW;IAG3C,IAAI,CAACG,aACH,MAAM,IAAIE,MAAM;IAGlB,IAAIC;IACJ,IAAI;QACFA,OAAOC,cAAcJ,aAAaK;IACpC,EAAE,OAAOC,GAAG;QACV,MAAM,IAAIJ,MAAM,CAAC,2BAA2B,EAAEI,GAAG;IACnD;IAGA,IAAIC;IACJ,IAAIN,WACF,IAAI;QACF,MAAMO,eAAeJ,cAAcH,WAAWI;QAC9C,IAAII,MAAM,OAAO,CAACD,eAChBD,SAASC;IAEb,EAAE,OAAOF,GAAG,CAEZ;IAGF,OAAO;QACL,GAAIR,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9BK;QACA,GAAII,UAAUA,OAAO,MAAM,GAAG,IAAI;YAAEA;QAAO,IAAI,CAAC,CAAC;IACnD;AACF;AAEO,SAASG;IACd,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;;;;;;;;;0EAWgE,EAAED,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAqE9F,CAAC;AACD;AAEO,MAAME,yBAAyB,CACpCC,iBACAC;IAEA,IAAIC,gBAAgB;IAElBA,gBADE,AAAqB,YAArB,OAAOD,YACOA,YAEAE,KAAK,SAAS,CAACF,WAAW,MAAM;IAGlD,OAAO,CAAC;;AAEV,EAAED,gBAAgB;;;;AAIlB,EAAEE,cAAc;;EAEd,CAAC;AACH"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { getPreferredLanguage } from "@midscene/shared/env";
|
|
2
|
+
import { bboxDescription } from "./common.mjs";
|
|
3
|
+
function systemPromptToLocateElement(modelFamily) {
|
|
4
|
+
const preferredLanguage = getPreferredLanguage();
|
|
5
|
+
const bboxComment = bboxDescription(modelFamily);
|
|
6
|
+
return `
|
|
7
|
+
## Role:
|
|
8
|
+
You are an AI assistant that helps identify UI elements.
|
|
9
|
+
|
|
10
|
+
## Objective:
|
|
11
|
+
- Identify elements in screenshots that match the user's description.
|
|
12
|
+
- Provide the coordinates of the element that matches the user's description.
|
|
13
|
+
|
|
14
|
+
## Important Notes for Locating Elements:
|
|
15
|
+
- When the user describes an element that contains text (such as buttons, input fields, dropdown options, radio buttons, etc.), you should locate ONLY the text region of that element, not the entire element boundary.
|
|
16
|
+
- For example: If an input field is large (both wide and tall) with a placeholder text "Please enter your comment", you should locate only the area where the placeholder text appears, not the entire input field.
|
|
17
|
+
- This principle applies to all text-containing elements: focus on the visible text region rather than the full element container.
|
|
18
|
+
|
|
19
|
+
## Output Format:
|
|
20
|
+
\`\`\`json
|
|
21
|
+
{
|
|
22
|
+
"bbox": [number, number, number, number], // ${bboxComment}
|
|
23
|
+
"errors"?: string[]
|
|
24
|
+
}
|
|
25
|
+
\`\`\`
|
|
26
|
+
|
|
27
|
+
Fields:
|
|
28
|
+
* \`bbox\` is the bounding box of the element that matches the user's description
|
|
29
|
+
* \`errors\` is an optional array of error messages (if any)
|
|
30
|
+
|
|
31
|
+
For example, when an element is found:
|
|
32
|
+
\`\`\`json
|
|
33
|
+
{
|
|
34
|
+
"bbox": [100, 100, 200, 200],
|
|
35
|
+
"errors": []
|
|
36
|
+
}
|
|
37
|
+
\`\`\`
|
|
38
|
+
|
|
39
|
+
When no element is found:
|
|
40
|
+
\`\`\`json
|
|
41
|
+
{
|
|
42
|
+
"bbox": [],
|
|
43
|
+
"errors": ["I can see ..., but {some element} is not found. Use ${preferredLanguage}."]
|
|
44
|
+
}
|
|
45
|
+
\`\`\`
|
|
46
|
+
`;
|
|
47
|
+
}
|
|
48
|
+
const findElementPrompt = (targetElementDescription)=>`Find: ${targetElementDescription}`;
|
|
49
|
+
export { findElementPrompt, systemPromptToLocateElement };
|
|
50
|
+
|
|
51
|
+
//# sourceMappingURL=llm-locator.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-locator.mjs","sources":["../../../../src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(\n modelFamily: TModelFamily | undefined,\n) {\n const preferredLanguage = getPreferredLanguage();\n const bboxComment = bboxDescription(modelFamily);\n return `\n## Role:\nYou are an AI assistant that helps identify UI elements.\n\n## Objective:\n- Identify elements in screenshots that match the user's description.\n- Provide the coordinates of the element that matches the user's description.\n\n## Important Notes for Locating Elements:\n- When the user describes an element that contains text (such as buttons, input fields, dropdown options, radio buttons, etc.), you should locate ONLY the text region of that element, not the entire element boundary.\n- For example: If an input field is large (both wide and tall) with a placeholder text \"Please enter your comment\", you should locate only the area where the placeholder text appears, not the entire input field.\n- This principle applies to all text-containing elements: focus on the visible text region rather than the full element container.\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[]\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description\n* \\`errors\\` is an optional array of error messages (if any)\n\nFor example, when an element is found:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"errors\": [\"I can see ..., but {some element} is not found. Use ${preferredLanguage}.\"]\n}\n\\`\\`\\`\n`;\n}\n\nexport const findElementPrompt = (targetElementDescription: string) =>\n `Find: ${targetElementDescription}`;\n"],"names":["systemPromptToLocateElement","modelFamily","preferredLanguage","getPreferredLanguage","bboxComment","bboxDescription","findElementPrompt","targetElementDescription"],"mappings":";;AAGO,SAASA,4BACdC,WAAqC;IAErC,MAAMC,oBAAoBC;IAC1B,MAAMC,cAAcC,gBAAgBJ;IACpC,OAAO,CAAC;;;;;;;;;;;;;;;;gDAgBsC,EAAEG,YAAY;;;;;;;;;;;;;;;;;;;;;kEAqBI,EAAEF,kBAAkB;;;AAGtF,CAAC;AACD;AAEO,MAAMI,oBAAoB,CAACC,2BAChC,CAAC,MAAM,EAAEA,0BAA0B"}
|