@loadmill/droid-cua 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/build/index.js +10 -2
- package/build/src/cli/headless-execution-config.js +33 -1
- package/build/src/commands/help.js +4 -0
- package/build/src/commands/run.js +1 -1
- package/build/src/core/execution-engine.js +85 -6
- package/build/src/core/prompts.js +3 -279
- package/build/src/device/android/actions.js +11 -7
- package/build/src/device/assertions.js +1 -21
- package/build/src/device/cloud/actions.js +13 -8
- package/build/src/device/ios/actions.js +13 -9
- package/build/src/device/openai.js +8 -113
- package/build/src/device/screenshot-resolution.js +33 -0
- package/build/src/device/scroll-gesture.js +20 -0
- package/build/src/integrations/loadmill/interpreter.js +3 -56
- package/build/src/modes/design-mode-ink.js +12 -17
- package/build/src/modes/design-mode.js +12 -17
- package/build/src/modes/execution-mode.js +20 -17
- package/build/src/prompts/base.js +139 -0
- package/build/src/prompts/design.js +115 -0
- package/build/src/prompts/editor.js +19 -0
- package/build/src/prompts/execution.js +182 -0
- package/build/src/prompts/loadmill.js +60 -0
- package/build/src/test-store/test-manager.js +3 -5
- package/build/src/test-store/test-script.js +50 -0
- package/package.json +1 -1
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Assertion handling for script validation
|
|
3
3
|
*/
|
|
4
4
|
import { printCliOutput } from "../utils/console-output.js";
|
|
5
|
+
export { buildAssertionSystemPrompt } from "../prompts/execution.js";
|
|
5
6
|
export function isAssertion(userInput) {
|
|
6
7
|
const trimmed = userInput.trim();
|
|
7
8
|
const lower = trimmed.toLowerCase();
|
|
@@ -20,27 +21,6 @@ export function extractAssertionPrompt(userInput) {
|
|
|
20
21
|
}
|
|
21
22
|
return trimmed;
|
|
22
23
|
}
|
|
23
|
-
export function buildAssertionSystemPrompt(baseSystemPrompt, assertionPrompt) {
|
|
24
|
-
return `${baseSystemPrompt}
|
|
25
|
-
|
|
26
|
-
ASSERTION MODE:
|
|
27
|
-
You are now validating an assertion. The user has provided an assertion statement that you must verify.
|
|
28
|
-
|
|
29
|
-
Your task:
|
|
30
|
-
1. Take screenshots and perform LIMITED actions if needed to validate the assertion.
|
|
31
|
-
2. Determine if the assertion is TRUE or FALSE based on the current state.
|
|
32
|
-
3. You MUST respond with a clear verdict in this exact format:
|
|
33
|
-
- If the assertion is true, include the text: "ASSERTION RESULT: PASS"
|
|
34
|
-
- If the assertion is false or cannot be confidently validated, include: "ASSERTION RESULT: FAIL"
|
|
35
|
-
4. After the verdict, provide a brief explanation (1-2 sentences) of why it passed or failed.
|
|
36
|
-
|
|
37
|
-
The assertion to validate is: "${assertionPrompt}"
|
|
38
|
-
|
|
39
|
-
Remember:
|
|
40
|
-
- If you cannot confidently validate the assertion, treat it as FAIL.
|
|
41
|
-
- You must include either "ASSERTION RESULT: PASS" or "ASSERTION RESULT: FAIL" in your response.
|
|
42
|
-
- Be thorough but efficient. Only take the actions necessary to validate the assertion.`;
|
|
43
|
-
}
|
|
44
24
|
export function checkAssertionResult(transcript) {
|
|
45
25
|
const transcriptText = transcript.join("\n");
|
|
46
26
|
const hasPassed = transcriptText.includes("ASSERTION RESULT: PASS");
|
|
@@ -2,6 +2,7 @@ import { logger } from "../../utils/logger.js";
|
|
|
2
2
|
import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
|
|
3
3
|
import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
|
|
4
4
|
import { getActiveSession, getDevicePixelRatio } from "./connection.js";
|
|
5
|
+
import { resolveScrollGesture } from "../scroll-gesture.js";
|
|
5
6
|
function normalizeMobileKeypress(platform, keys = []) {
|
|
6
7
|
if (!Array.isArray(keys) || keys.length === 0) {
|
|
7
8
|
throw new Error("Keypress action is missing keys");
|
|
@@ -93,14 +94,18 @@ export async function handleModelAction(deviceId, action, scale = 1.0, context =
|
|
|
93
94
|
break;
|
|
94
95
|
}
|
|
95
96
|
case "scroll": {
|
|
96
|
-
const scrollX =
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
addOutput({
|
|
103
|
-
|
|
97
|
+
const { scrollX, scrollY, startX, startY, endX, endY, hasAnchor } = resolveScrollGesture(action, {
|
|
98
|
+
scale,
|
|
99
|
+
dpr,
|
|
100
|
+
fallbackStartX: 200,
|
|
101
|
+
fallbackStartY: 400
|
|
102
|
+
});
|
|
103
|
+
addOutput({
|
|
104
|
+
type: "action",
|
|
105
|
+
text: `Scrolling from (${startX}, ${startY}) to (${endX}, ${endY}) by (${scrollX}, ${scrollY})`,
|
|
106
|
+
...meta({ scrollX, scrollY, startX, startY, endX, endY, anchorSource: hasAnchor ? "action" : "fallback" })
|
|
107
|
+
});
|
|
108
|
+
await session.client.scroll(session.sessionId, startX, startY, endX, endY);
|
|
104
109
|
break;
|
|
105
110
|
}
|
|
106
111
|
case "drag": {
|
|
@@ -8,6 +8,7 @@ import { getActiveSession, getDevicePixelRatio } from "./connection.js";
|
|
|
8
8
|
import { logger } from "../../utils/logger.js";
|
|
9
9
|
import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
|
|
10
10
|
import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
|
|
11
|
+
import { resolveScrollGesture } from "../scroll-gesture.js";
|
|
11
12
|
function normalizeMobileKeypress(keys = []) {
|
|
12
13
|
if (!Array.isArray(keys) || keys.length === 0) {
|
|
13
14
|
throw new Error("Keypress action is missing keys");
|
|
@@ -92,15 +93,18 @@ export async function handleModelAction(simulatorId, action, scale = 1.0, contex
|
|
|
92
93
|
}
|
|
93
94
|
case "scroll": {
|
|
94
95
|
const dpr = getDevicePixelRatio();
|
|
95
|
-
const scrollX =
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
96
|
+
const { scrollX, scrollY, startX, startY, endX, endY, hasAnchor } = resolveScrollGesture(action, {
|
|
97
|
+
scale,
|
|
98
|
+
dpr,
|
|
99
|
+
fallbackStartX: 197,
|
|
100
|
+
fallbackStartY: 426
|
|
101
|
+
});
|
|
102
|
+
addOutput({
|
|
103
|
+
type: "action",
|
|
104
|
+
text: `Scrolling from (${startX}, ${startY}) to (${endX}, ${endY}) by (${scrollX}, ${scrollY}) points`,
|
|
105
|
+
...meta({ scrollX, scrollY, startX, startY, endX, endY, anchorSource: hasAnchor ? "action" : "fallback", unit: "points" })
|
|
106
|
+
});
|
|
107
|
+
await appium.scroll(session.sessionId, startX, startY, endX, endY);
|
|
104
108
|
break;
|
|
105
109
|
}
|
|
106
110
|
case "drag": {
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import OpenAI from "openai";
|
|
2
|
+
import { buildTestRevisionSystemPrompt } from "../prompts/editor.js";
|
|
3
|
+
import { buildAppContextCompactionInput } from "../prompts/execution.js";
|
|
2
4
|
import { logger } from "../utils/logger.js";
|
|
3
5
|
import { CuaDebugTracer } from "../utils/cua-debug-tracer.js";
|
|
4
6
|
let openai = null;
|
|
@@ -129,23 +131,7 @@ export async function reviseTestScript(originalScript, revisionRequest) {
|
|
|
129
131
|
model: "gpt-4o",
|
|
130
132
|
messages: [{
|
|
131
133
|
role: "system",
|
|
132
|
-
content:
|
|
133
|
-
|
|
134
|
-
Current test script:
|
|
135
|
-
${originalScript}
|
|
136
|
-
|
|
137
|
-
User's revision request:
|
|
138
|
-
${revisionRequest}
|
|
139
|
-
|
|
140
|
-
Apply the user's changes and output the revised test script.
|
|
141
|
-
|
|
142
|
-
FORMAT RULES:
|
|
143
|
-
- One simple instruction per line (NO numbers, NO bullets)
|
|
144
|
-
- Use imperative commands: "Open X", "Click Y", "Type Z"
|
|
145
|
-
- Include "assert: <condition>" lines to validate expected behavior
|
|
146
|
-
- End with "exit"
|
|
147
|
-
|
|
148
|
-
Output only the revised test script, nothing else.`
|
|
134
|
+
content: buildTestRevisionSystemPrompt(originalScript, revisionRequest)
|
|
149
135
|
}]
|
|
150
136
|
});
|
|
151
137
|
return response.choices[0].message.content.trim();
|
|
@@ -154,102 +140,11 @@ export async function compactAppContext({ contextDocument, taskDescription, toke
|
|
|
154
140
|
const response = await getOpenAI().responses.create({
|
|
155
141
|
model: "gpt-5.4",
|
|
156
142
|
temperature: 0,
|
|
157
|
-
input:
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
text: `You are compressing an app context document for a mobile testing agent.
|
|
163
|
-
|
|
164
|
-
You will receive:
|
|
165
|
-
1. A context document
|
|
166
|
-
2. A test task
|
|
167
|
-
|
|
168
|
-
Your job is to SELECT only the facts from the context document that are useful for the given task.
|
|
169
|
-
The output will be injected into a system prompt with a strict token budget.
|
|
170
|
-
|
|
171
|
-
CRITICAL:
|
|
172
|
-
- Use only facts explicitly supported by the context document
|
|
173
|
-
- Never invent, infer, normalize, substitute, or improve credentials, labels, screen names, button names, or numeric values
|
|
174
|
-
- Preserve exact values verbatim when present in the source
|
|
175
|
-
- Prefer facts that help the agent act correctly when they are not obvious from the task alone
|
|
176
|
-
- Do not restate, paraphrase, summarize, or reorganize the test task
|
|
177
|
-
- The output must not read like instructions or a test plan
|
|
178
|
-
- Do not describe what the agent should do
|
|
179
|
-
- Output only reference knowledge about the app
|
|
180
|
-
- If a line could be copied from the task with minor wording changes, omit it
|
|
181
|
-
- Prefer copying source facts verbatim or near-verbatim over rewriting them
|
|
182
|
-
- Do not collapse multiple specific source facts into one generic summary if that removes useful distinctions
|
|
183
|
-
|
|
184
|
-
Selection priority:
|
|
185
|
-
1. Facts the agent would NOT know from the test script alone
|
|
186
|
-
2. Facts that are hard to infer from screenshots
|
|
187
|
-
3. Non-obvious navigation or interaction details
|
|
188
|
-
4. Exact visible labels needed to act correctly
|
|
189
|
-
5. Credentials and other exact values
|
|
190
|
-
|
|
191
|
-
High-value facts:
|
|
192
|
-
- exact UI labels
|
|
193
|
-
- how state, mode, or account selection is performed
|
|
194
|
-
- where logout is located
|
|
195
|
-
- hidden or non-obvious navigation
|
|
196
|
-
- which menu items are decorative or non-functional
|
|
197
|
-
- screen titles and section labels used to confirm location
|
|
198
|
-
- exact credentials and role labels
|
|
199
|
-
|
|
200
|
-
Low-value facts:
|
|
201
|
-
- restating the test steps
|
|
202
|
-
- repeating literal values already present in the task
|
|
203
|
-
- generic summaries like "approve the transaction"
|
|
204
|
-
|
|
205
|
-
When the task involves authentication, switching state or mode, opening menus, or moving between major areas of the app, strongly prefer including:
|
|
206
|
-
- how account, state, or mode selection is performed
|
|
207
|
-
- exact visible labels for the relevant controls
|
|
208
|
-
- where exit or sign-out actions are located
|
|
209
|
-
- the screen or section labels that confirm the agent is in the right place
|
|
210
|
-
|
|
211
|
-
Rules:
|
|
212
|
-
- Output plain text only
|
|
213
|
-
- No markdown, no bullet symbols, no numbering, no headers
|
|
214
|
-
- Use terse, factual language: one fact per line, no filler words
|
|
215
|
-
- Blank lines only to separate logical groups
|
|
216
|
-
- Prefer exact visible UI labels over summaries
|
|
217
|
-
- Do not describe step-by-step procedures
|
|
218
|
-
- Do not restate the test workflow
|
|
219
|
-
- State only facts about screens, elements, hidden interactions, entities, credentials, and navigation
|
|
220
|
-
- If a useful fact is not explicitly stated in the context document, omit it
|
|
221
|
-
- Include only information relevant to this task
|
|
222
|
-
- Do not waste space repeating the task itself
|
|
223
|
-
- If the task already states a value or action, include it only when the context adds non-obvious execution details
|
|
224
|
-
- Return a short result or an empty string if little is relevant
|
|
225
|
-
- Target: under ${tokenBudget} tokens
|
|
226
|
-
|
|
227
|
-
Bad output patterns to avoid:
|
|
228
|
-
- generic summaries that remove actionable details
|
|
229
|
-
- lines that restate the task in generic prose
|
|
230
|
-
- lines that describe obvious workflow steps instead of app knowledge
|
|
231
|
-
- lines that replace exact source labels or mechanisms with broad summaries
|
|
232
|
-
|
|
233
|
-
Good output characteristics:
|
|
234
|
-
- preserves the exact label or mechanism from the source when it matters
|
|
235
|
-
- keeps distinctions like dropdown vs tabs, drawer vs visible button, exact section titles, exact button text
|
|
236
|
-
- includes hidden or non-obvious navigation details when relevant
|
|
237
|
-
|
|
238
|
-
Return only the briefing text.`
|
|
239
|
-
}]
|
|
240
|
-
},
|
|
241
|
-
{
|
|
242
|
-
role: "user",
|
|
243
|
-
content: [{
|
|
244
|
-
type: "input_text",
|
|
245
|
-
text: `APP CONTEXT DOCUMENT:
|
|
246
|
-
${contextDocument}
|
|
247
|
-
|
|
248
|
-
TASK:
|
|
249
|
-
${taskDescription}`
|
|
250
|
-
}]
|
|
251
|
-
}
|
|
252
|
-
]
|
|
143
|
+
input: buildAppContextCompactionInput({
|
|
144
|
+
contextDocument,
|
|
145
|
+
taskDescription,
|
|
146
|
+
tokenBudget,
|
|
147
|
+
})
|
|
253
148
|
});
|
|
254
149
|
return {
|
|
255
150
|
briefing: typeof response.output_text === "string" ? response.output_text.trim() : "",
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/** Width, in pixels, that screenshots are scaled down to in "downscaled" mode. */
export const TARGET_SCALED_WIDTH = 400;
/** Mode in which screenshots wider than TARGET_SCALED_WIDTH are scaled down. */
export const SCREENSHOT_RESOLUTION_MODE_DOWNSCALED = "downscaled";
/** Mode in which screenshots keep their original size (scale is always 1.0). */
export const SCREENSHOT_RESOLUTION_MODE_NATIVE = "native";

/**
 * Coerce any value to a known screenshot resolution mode.
 *
 * @param {unknown} value - Candidate mode.
 * @returns {string} "native" only when `value` is exactly "native"; every
 *   other value (including undefined) maps to "downscaled".
 */
export function normalizeScreenshotResolutionMode(value) {
  return value === SCREENSHOT_RESOLUTION_MODE_NATIVE
    ? SCREENSHOT_RESOLUTION_MODE_NATIVE
    : SCREENSHOT_RESOLUTION_MODE_DOWNSCALED;
}

/**
 * Strictly validate a screenshot resolution mode.
 *
 * @param {unknown} value - Candidate mode; must be exactly "downscaled" or "native".
 * @param {string} label - Name of the setting, used as the prefix of the error message.
 * @returns {string} The validated mode, unchanged.
 * @throws {Error} When `value` is not a string or is not one of the two known modes.
 */
export function validateScreenshotResolutionMode(value, label) {
  const isKnownMode =
    value === SCREENSHOT_RESOLUTION_MODE_NATIVE ||
    value === SCREENSHOT_RESOLUTION_MODE_DOWNSCALED;
  if (!isKnownMode) {
    throw new Error(`${label} must be one of: downscaled, native.`);
  }
  return value;
}

/**
 * Compute the screenshot scale factor and scaled dimensions for a device.
 *
 * @param {object} params
 * @param {number} params.width - Device screenshot width.
 * @param {number} params.height - Device screenshot height.
 * @param {string} [params.screenshotResolutionMode] - Requested mode; normalized
 *   with normalizeScreenshotResolutionMode, so unknown values mean "downscaled".
 * @returns {{scaled_width: number, scaled_height: number, scale: number, screenshot_resolution_mode: string}}
 *   `scale` is 1.0 in native mode or when the width is at most
 *   TARGET_SCALED_WIDTH; otherwise it is TARGET_SCALED_WIDTH / width.
 */
export function buildResolutionAwareDeviceInfo({ width, height, screenshotResolutionMode, }) {
  const normalizedMode = normalizeScreenshotResolutionMode(screenshotResolutionMode);
  const numericWidth = Number(width);
  // Only downscale when the width is a real number above the target. A missing
  // or non-numeric width previously produced `scale: NaN`; it now keeps 1.0.
  const shouldDownscale =
    normalizedMode !== SCREENSHOT_RESOLUTION_MODE_NATIVE &&
    Number.isFinite(numericWidth) &&
    numericWidth > TARGET_SCALED_WIDTH;
  const scale = shouldDownscale ? TARGET_SCALED_WIDTH / numericWidth : 1.0;
  return {
    scaled_width: Math.round(width * scale),
    scaled_height: Math.round(height * scale),
    scale,
    screenshot_resolution_mode: normalizedMode,
  };
}

/**
 * Read the screenshot resolution mode from the
 * DROID_CUA_SCREENSHOT_RESOLUTION_MODE environment variable.
 *
 * Surrounding whitespace and letter case are ignored so that values such as
 * "Native" or " native " are honored instead of silently becoming "downscaled".
 *
 * @returns {string} "native" or "downscaled" (the default when unset or unknown).
 */
export function readScreenshotResolutionModeFromEnv() {
  const rawValue = process.env.DROID_CUA_SCREENSHOT_RESOLUTION_MODE;
  const cleanedValue = typeof rawValue === "string" ? rawValue.trim().toLowerCase() : rawValue;
  return normalizeScreenshotResolutionMode(cleanedValue);
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Convert a model scroll action into backend gesture coordinates.
 *
 * The model returns scroll actions with `scroll_x` / `scroll_y` plus optional
 * anchor coordinates `x` / `y`. All four are divided by `scale * dpr` and
 * rounded. The gesture ends at `startX + scrollX` horizontally and
 * `startY - scrollY` vertically.
 *
 * @param {object} [action] - Model action; `scroll_x`, `scroll_y`, `x`, `y` are all optional.
 * @param {object} [options]
 * @param {number} [options.scale=1.0] - Screenshot scale factor.
 * @param {number} [options.dpr=1.0] - Device pixel ratio.
 * @param {number} [options.fallbackStartX=0] - Start X used (rounded, not divided) when the action has no finite anchor.
 * @param {number} [options.fallbackStartY=0] - Start Y used (rounded, not divided) when the action has no finite anchor.
 * @returns {{scrollX: number, scrollY: number, startX: number, startY: number, endX: number, endY: number, hasAnchor: boolean}}
 *   `hasAnchor` is true only when both `action.x` and `action.y` are finite numbers.
 */
export function resolveScrollGesture(action, { scale = 1.0, dpr = 1.0, fallbackStartX = 0, fallbackStartY = 0 } = {}) {
  // A zero, negative, null or non-numeric factor would make the divisor 0/NaN
  // and turn every coordinate into Infinity/NaN, so such factors count as 1.
  const toFactor = (value) => {
    const numeric = Number(value);
    return Number.isFinite(numeric) && numeric > 0 ? numeric : 1;
  };
  // Missing or non-numeric scroll deltas mean "no movement on this axis".
  const toDelta = (value) => {
    const numeric = Number(value ?? 0);
    return Number.isFinite(numeric) ? numeric : 0;
  };

  const divisor = toFactor(scale) * toFactor(dpr);
  const scrollX = Math.round(toDelta(action?.scroll_x) / divisor);
  const scrollY = Math.round(toDelta(action?.scroll_y) / divisor);

  const hasAnchor = Number.isFinite(action?.x) && Number.isFinite(action?.y);
  const startX = hasAnchor ? Math.round(action.x / divisor) : Math.round(fallbackStartX);
  const startY = hasAnchor ? Math.round(action.y / divisor) : Math.round(fallbackStartY);

  return {
    scrollX,
    scrollY,
    startX,
    startY,
    endX: startX + scrollX,
    endY: startY - scrollY,
    hasAnchor
  };
}
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* AI-powered text interpretation for Loadmill commands
|
|
3
3
|
*/
|
|
4
4
|
import OpenAI from "openai";
|
|
5
|
+
import { buildLoadmillCommandInterpretationMessages, buildLoadmillFlowSelectionMessages, } from "../../prompts/loadmill.js";
|
|
5
6
|
let openai = null;
|
|
6
7
|
function getOpenAI() {
|
|
7
8
|
if (!openai) {
|
|
@@ -19,39 +20,7 @@ function getOpenAI() {
|
|
|
19
20
|
export async function interpretLoadmillCommand(userInput) {
|
|
20
21
|
const response = await getOpenAI().chat.completions.create({
|
|
21
22
|
model: "gpt-4o-mini",
|
|
22
|
-
messages:
|
|
23
|
-
{
|
|
24
|
-
role: "system",
|
|
25
|
-
content: `You are a parser that extracts structured data from natural language Loadmill commands.
|
|
26
|
-
|
|
27
|
-
Extract the following from the user's input:
|
|
28
|
-
1. searchQuery: The flow name or description to search for (required). FIX any obvious typos or misspellings.
|
|
29
|
-
2. parameters: Any key=value pairs mentioned (as an object)
|
|
30
|
-
3. action: Either "run" (if user wants to execute) or "search" (if user just wants to find flows)
|
|
31
|
-
|
|
32
|
-
Output JSON only, no markdown or explanation.
|
|
33
|
-
|
|
34
|
-
Examples:
|
|
35
|
-
Input: "run the checkout flow with user=test123"
|
|
36
|
-
Output: {"searchQuery": "checkout flow", "parameters": {"user": "test123"}, "action": "run"}
|
|
37
|
-
|
|
38
|
-
Input: "search for login test"
|
|
39
|
-
Output: {"searchQuery": "login test", "parameters": {}, "action": "search"}
|
|
40
|
-
|
|
41
|
-
Input: "run user authentication with email=test@example.com password=secret123"
|
|
42
|
-
Output: {"searchQuery": "user authentication", "parameters": {"email": "test@example.com", "password": "secret123"}, "action": "run"}
|
|
43
|
-
|
|
44
|
-
Input: "execute payment flow"
|
|
45
|
-
Output: {"searchQuery": "payment flow", "parameters": {}, "action": "run"}
|
|
46
|
-
|
|
47
|
-
Input: "create a transction with amount=200"
|
|
48
|
-
Output: {"searchQuery": "transaction", "parameters": {"amount": "200"}, "action": "run"}`
|
|
49
|
-
},
|
|
50
|
-
{
|
|
51
|
-
role: "user",
|
|
52
|
-
content: userInput
|
|
53
|
-
}
|
|
54
|
-
],
|
|
23
|
+
messages: buildLoadmillCommandInterpretationMessages(userInput),
|
|
55
24
|
response_format: { type: "json_object" }
|
|
56
25
|
});
|
|
57
26
|
const content = response.choices[0].message.content;
|
|
@@ -84,29 +53,7 @@ export async function selectBestFlow(flows, originalQuery) {
|
|
|
84
53
|
}).join("\n");
|
|
85
54
|
const response = await getOpenAI().chat.completions.create({
|
|
86
55
|
model: "gpt-4o-mini",
|
|
87
|
-
messages:
|
|
88
|
-
{
|
|
89
|
-
role: "system",
|
|
90
|
-
content: `You are selecting the best matching test flow based on a user query.
|
|
91
|
-
|
|
92
|
-
Given the user's query and a list of available flows, select the best match.
|
|
93
|
-
|
|
94
|
-
Output JSON with:
|
|
95
|
-
- index: 1-based index of the best matching flow
|
|
96
|
-
- confidence: number between 0 and 1 indicating how confident you are
|
|
97
|
-
|
|
98
|
-
If no flow seems to match well, set confidence to a low value (< 0.5).
|
|
99
|
-
|
|
100
|
-
Output JSON only, no markdown.`
|
|
101
|
-
},
|
|
102
|
-
{
|
|
103
|
-
role: "user",
|
|
104
|
-
content: `Query: "${originalQuery}"
|
|
105
|
-
|
|
106
|
-
Available flows:
|
|
107
|
-
${flowList}`
|
|
108
|
-
}
|
|
109
|
-
],
|
|
56
|
+
messages: buildLoadmillFlowSelectionMessages(originalQuery, flowList),
|
|
110
57
|
response_format: { type: "json_object" }
|
|
111
58
|
});
|
|
112
59
|
const content = response.choices[0].message.content;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { getScreenshotAsBase64 } from "../device/connection.js";
|
|
2
2
|
import { sendCUARequest, reviseTestScript } from "../device/openai.js";
|
|
3
|
-
import { buildDesignModePrompt } from "../core/prompts.js";
|
|
3
|
+
import { buildDesignModePrompt, buildDesignRecoveryPrompt } from "../core/prompts.js";
|
|
4
4
|
import { saveTest } from "../test-store/test-manager.js";
|
|
5
5
|
import { logger } from "../utils/logger.js";
|
|
6
6
|
/**
|
|
@@ -20,6 +20,7 @@ export class DesignModeInk {
|
|
|
20
20
|
this.waitingForInput = false; // Flag to indicate we're explicitly waiting for input
|
|
21
21
|
this.inputResolver = null; // Promise resolver for input
|
|
22
22
|
this.initialUserPrompt = null; // Store initial prompt for error recovery
|
|
23
|
+
this.baseDesignPrompt = null;
|
|
23
24
|
this.consecutiveErrorCount = 0;
|
|
24
25
|
this.maxConsecutiveErrors = 3;
|
|
25
26
|
}
|
|
@@ -30,7 +31,10 @@ export class DesignModeInk {
|
|
|
30
31
|
async start() {
|
|
31
32
|
const addOutput = this.context.addOutput;
|
|
32
33
|
// Set design mode system prompt
|
|
33
|
-
const designPrompt = buildDesignModePrompt(this.session.deviceInfo
|
|
34
|
+
const designPrompt = buildDesignModePrompt(this.session.deviceInfo, {}, {
|
|
35
|
+
strictMode: Boolean(this.engine?.strictMode)
|
|
36
|
+
});
|
|
37
|
+
this.baseDesignPrompt = designPrompt;
|
|
34
38
|
this.session.setSystemPrompt(designPrompt);
|
|
35
39
|
// Update UI
|
|
36
40
|
if (this.context.setMode) {
|
|
@@ -330,21 +334,12 @@ export class DesignModeInk {
|
|
|
330
334
|
}
|
|
331
335
|
// Automatic recovery - continue from where we left off using transcript
|
|
332
336
|
addOutput({ type: 'info', text: 'Recovering from error and continuing...' });
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
${this.session.getTranscriptText()}
|
|
341
|
-
|
|
342
|
-
Continue from where we left off and complete the original task: "${this.initialUserPrompt}"
|
|
343
|
-
|
|
344
|
-
Remember:
|
|
345
|
-
- Don't repeat actions that already succeeded
|
|
346
|
-
- Continue towards generating the test script
|
|
347
|
-
- If the flow was complete before the error, generate the script now`;
|
|
337
|
+
const recoveryContext = buildDesignRecoveryPrompt({
|
|
338
|
+
basePrompt: this.baseDesignPrompt || this.session.systemPrompt || buildDesignModePrompt(this.session.deviceInfo),
|
|
339
|
+
transcript: this.session.getTranscriptText(),
|
|
340
|
+
objective: this.initialUserPrompt,
|
|
341
|
+
errorMessage: err.message
|
|
342
|
+
});
|
|
348
343
|
// Reset conversation state for fresh API call
|
|
349
344
|
this.session.clearMessages();
|
|
350
345
|
this.session.addMessage("system", recoveryContext);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import readline from "readline";
|
|
2
2
|
import { getScreenshotAsBase64 } from "../device/connection.js";
|
|
3
3
|
import { sendCUARequest, reviseTestScript } from "../device/openai.js";
|
|
4
|
-
import { buildDesignModePrompt } from "../core/prompts.js";
|
|
4
|
+
import { buildDesignModePrompt, buildDesignRecoveryPrompt } from "../core/prompts.js";
|
|
5
5
|
import { saveTest } from "../test-store/test-manager.js";
|
|
6
6
|
import { logger } from "../utils/logger.js";
|
|
7
7
|
/**
|
|
@@ -17,6 +17,7 @@ export class DesignMode {
|
|
|
17
17
|
this.escPressed = false;
|
|
18
18
|
this.recentActions = []; // Track recent actions for stuck detection
|
|
19
19
|
this.initialUserPrompt = null; // Store initial prompt for error recovery
|
|
20
|
+
this.baseDesignPrompt = null;
|
|
20
21
|
this.consecutiveErrorCount = 0;
|
|
21
22
|
this.maxConsecutiveErrors = 3;
|
|
22
23
|
}
|
|
@@ -27,7 +28,10 @@ export class DesignMode {
|
|
|
27
28
|
*/
|
|
28
29
|
async start(context) {
|
|
29
30
|
// Set design mode system prompt
|
|
30
|
-
const designPrompt = buildDesignModePrompt(this.session.deviceInfo
|
|
31
|
+
const designPrompt = buildDesignModePrompt(this.session.deviceInfo, {}, {
|
|
32
|
+
strictMode: Boolean(this.engine?.strictMode)
|
|
33
|
+
});
|
|
34
|
+
this.baseDesignPrompt = designPrompt;
|
|
31
35
|
this.session.setSystemPrompt(designPrompt);
|
|
32
36
|
console.log(`\n=== Design Mode: Creating test "${this.testName}" ===`);
|
|
33
37
|
console.log("Describe what you want to test. The agent will explore autonomously.");
|
|
@@ -314,21 +318,12 @@ export class DesignMode {
|
|
|
314
318
|
}
|
|
315
319
|
// Automatic recovery - continue from where we left off using transcript
|
|
316
320
|
console.log("\nRecovering from error and continuing...");
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
${this.session.getTranscriptText()}
|
|
325
|
-
|
|
326
|
-
Continue from where we left off and complete the original task: "${this.initialUserPrompt}"
|
|
327
|
-
|
|
328
|
-
Remember:
|
|
329
|
-
- Don't repeat actions that already succeeded
|
|
330
|
-
- Continue towards generating the test script
|
|
331
|
-
- If the flow was complete before the error, generate the script now`;
|
|
321
|
+
const recoveryContext = buildDesignRecoveryPrompt({
|
|
322
|
+
basePrompt: this.baseDesignPrompt || this.session.systemPrompt || buildDesignModePrompt(this.session.deviceInfo),
|
|
323
|
+
transcript: this.session.getTranscriptText(),
|
|
324
|
+
objective: this.initialUserPrompt,
|
|
325
|
+
errorMessage: err.message
|
|
326
|
+
});
|
|
332
327
|
// Reset conversation state for fresh API call
|
|
333
328
|
this.session.clearMessages();
|
|
334
329
|
this.session.addMessage("system", recoveryContext);
|
|
@@ -1,10 +1,19 @@
|
|
|
1
1
|
import { getScreenshotAsBase64, connectToDevice, getDeviceInfo, getCurrentPlatform } from "../device/connection.js";
|
|
2
2
|
import { sendCUARequest } from "../device/openai.js";
|
|
3
|
+
import { buildExecutionRecoveryPrompt } from "../core/prompts.js";
|
|
3
4
|
import { isAssertion, extractAssertionPrompt, buildAssertionSystemPrompt, checkAssertionResult, handleAssertionFailure, handleAssertionSuccess, } from "../device/assertions.js";
|
|
4
5
|
import { isLoadmillInstruction, extractLoadmillCommand, executeLoadmillInstruction, } from "../device/loadmill.js";
|
|
5
6
|
import { logger } from "../utils/logger.js";
|
|
6
7
|
import { emitDesktopDebug } from "../utils/desktop-debug.js";
|
|
7
8
|
import { printCliOutput } from "../utils/console-output.js";
|
|
9
|
+
/**
 * Decide which messages accompany the next execution request.
 *
 * When a previous response id exists and the step is not an assertion, only
 * the new user instruction is sent. In every other case the full `messages`
 * array is sent as-is. The previous response id is always passed through
 * unchanged.
 *
 * @param {object} params
 * @param {string} params.instruction - Instruction text for the current step.
 * @param {boolean} params.isAssertionStep - Whether the current step is an assertion.
 * @param {Array<object>} params.messages - Full message list to send when not continuing with a single instruction.
 * @param {string} [params.previousResponseId] - Id of the prior response, if any.
 * @returns {{messagesToSend: Array<object>, previousResponseIdToSend: (string|undefined)}}
 */
export function buildExecutionRequestPayload({ instruction, isAssertionStep, messages, previousResponseId }) {
  const sendInstructionOnly = Boolean(previousResponseId) && !isAssertionStep;

  let messagesToSend = messages;
  if (sendInstructionOnly) {
    messagesToSend = [{ role: "user", content: instruction }];
  }

  return {
    messagesToSend,
    previousResponseIdToSend: previousResponseId,
  };
}
|
|
8
17
|
/**
|
|
9
18
|
* Execution Mode - Run test scripts line-by-line
|
|
10
19
|
* Each instruction is executed in isolation (messages cleared after each turn)
|
|
@@ -210,18 +219,13 @@ export class ExecutionMode {
|
|
|
210
219
|
instructionIndex: stepContext?.instructionIndex,
|
|
211
220
|
captureSource: isAssertionStep ? "instruction-input-assertion" : "instruction-input"
|
|
212
221
|
});
|
|
213
|
-
//
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
}
|
|
221
|
-
else {
|
|
222
|
-
// Fresh start or assertion - send full messages (system + user)
|
|
223
|
-
messagesToSend = this.session.messages;
|
|
224
|
-
}
|
|
222
|
+
// Assertions rely on the prior response chain for earlier execution context.
|
|
223
|
+
const { messagesToSend, previousResponseIdToSend } = buildExecutionRequestPayload({
|
|
224
|
+
instruction,
|
|
225
|
+
isAssertionStep,
|
|
226
|
+
messages: this.session.messages,
|
|
227
|
+
previousResponseId: this.session.previousResponseId
|
|
228
|
+
});
|
|
225
229
|
const response = await sendCUARequest({
|
|
226
230
|
messages: messagesToSend,
|
|
227
231
|
screenshotBase64,
|
|
@@ -410,11 +414,10 @@ export class ExecutionMode {
|
|
|
410
414
|
const transcriptContext = this.session.getTranscriptText();
|
|
411
415
|
this.session.clearMessages();
|
|
412
416
|
// clearMessages() restores the base system prompt, but we need to add context
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
}
|
|
417
|
+
const recoverySystemPrompt = buildExecutionRecoveryPrompt({
|
|
418
|
+
basePrompt: this.initialSystemText,
|
|
419
|
+
transcript: transcriptContext
|
|
420
|
+
});
|
|
418
421
|
// Replace the system message with the enhanced one
|
|
419
422
|
this.session.messages = [{ role: "system", content: recoverySystemPrompt }];
|
|
420
423
|
this.session.updateResponseId(undefined);
|