@loadmill/droid-cua 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/build/index.js +10 -2
- package/build/src/cli/headless-execution-config.js +33 -1
- package/build/src/commands/help.js +4 -0
- package/build/src/commands/run.js +1 -1
- package/build/src/core/execution-engine.js +85 -6
- package/build/src/core/prompts.js +3 -279
- package/build/src/device/android/actions.js +11 -7
- package/build/src/device/assertions.js +1 -21
- package/build/src/device/cloud/actions.js +13 -8
- package/build/src/device/ios/actions.js +13 -9
- package/build/src/device/openai.js +8 -113
- package/build/src/device/screenshot-resolution.js +33 -0
- package/build/src/device/scroll-gesture.js +20 -0
- package/build/src/integrations/loadmill/interpreter.js +3 -56
- package/build/src/modes/design-mode-ink.js +12 -17
- package/build/src/modes/design-mode.js +12 -17
- package/build/src/modes/execution-mode.js +20 -17
- package/build/src/prompts/base.js +139 -0
- package/build/src/prompts/design.js +115 -0
- package/build/src/prompts/editor.js +19 -0
- package/build/src/prompts/execution.js +182 -0
- package/build/src/prompts/loadmill.js +60 -0
- package/build/src/test-store/test-manager.js +3 -5
- package/build/src/test-store/test-script.js +50 -0
- package/package.json +1 -1
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
 * Render user-configured instruction sections into a single prompt block.
 *
 * Each entry is expected to look like `{ title, text }`. String titles and
 * texts are trimmed; an entry is dropped when its title is empty/falsy or its
 * text is missing, not a string, or blank after trimming.
 *
 * @param {Array<{title?: string, text?: string}>} [sections=[]] Candidate sections.
 * @returns {string} The "USER CUSTOM INSTRUCTIONS" block, or "" when no usable section remains.
 */
function buildCustomInstructionsSection(sections = []) {
    // A default parameter only replaces `undefined`; treat null and other
    // non-array values as "no sections" instead of throwing on `.map`.
    const candidates = Array.isArray(sections) ? sections : [];
    // Trim string titles so a whitespace-only title cannot produce a blank heading.
    const trimIfString = (value) => (typeof value === "string" ? value.trim() : value);
    const renderedSections = candidates
        .map((section) => ({
            title: trimIfString(section?.title),
            text: typeof section?.text === "string" ? section.text.trim() : "",
        }))
        .filter(({ title, text }) => title && text)
        .map(({ title, text }) => `${title}:\n${text}`)
        .join("\n\n");
    if (renderedSections === "") {
        return "";
    }
    return `USER CUSTOM INSTRUCTIONS:
Follow these user-configured instructions in addition to the default behavior below.
Prefer these custom instructions when deciding how to behave.

${renderedSections}`;
}
|
|
20
|
+
/**
 * Attach the rendered custom-instruction block to the end of a prompt.
 *
 * @param {string} prompt Prompt text to extend.
 * @param {Array<{title?: string, text?: string}>} [sections=[]] Sections passed to buildCustomInstructionsSection.
 * @returns {string} `prompt` unchanged when no section renders; otherwise the
 *   prompt, a blank line, the custom block, and a trailing newline.
 */
function appendCustomSections(prompt, sections = []) {
    const customBlock = buildCustomInstructionsSection(sections);
    return customBlock ? `${prompt}\n\n${customBlock}\n` : prompt;
}
|
|
30
|
+
/**
 * Build the prompt section that explains strict-mode runtime behavior.
 *
 * @param {{strictMode?: boolean}} [runtimeOptions={}] Runtime flags.
 * @returns {string} The strict-mode section when `strictMode` is exactly `true`, otherwise "".
 */
function buildStrictModeRuntimeSection(runtimeOptions = {}) {
    const strictModeEnabled = runtimeOptions?.strictMode === true;
    if (strictModeEnabled) {
        return [
            "STRICT MODE - Observation-First Runtime:",
            "- The runtime may execute only the first meaningful action from any multi-step action chain you propose.",
            "- After that first action, it will re-observe the device and continue from a fresh screenshot.",
            "- If you propose multiple steps, assume only the first may happen before the next turn.",
            "- Do not assume later proposed actions succeeded unless the next screenshot confirms them.",
        ].join("\n");
    }
    return "";
}
|
|
40
|
+
/**
 * Produce a short human-readable phrase for the device being controlled.
 *
 * @param {{platform?: string, device_name?: string} | null} [deviceInfo={}]
 *   Device metadata; `platform` is matched case-insensitively after trimming.
 * @returns {string} A phrase such as "an iOS simulator (NAME)", "an Android device",
 *   or "a mobile device" when the platform is missing or unrecognized.
 */
function describeControlledDevice(deviceInfo = {}) {
    // The default parameter does not apply to `null`, so read fields with `?.`
    // to fall back to the generic phrase instead of throwing.
    const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
    const platform = trimmedString(deviceInfo?.platform).toLowerCase();
    const deviceName = trimmedString(deviceInfo?.device_name);
    if (platform === "ios") {
        return deviceName ? `an iOS simulator (${deviceName})` : "an iOS device";
    }
    if (platform === "android") {
        return deviceName ? `an Android device (${deviceName})` : "an Android device";
    }
    return "a mobile device";
}
|
|
51
|
+
/**
 * Build the base system prompt shared by the design and execution modes.
 *
 * The prompt names the controlled device, states the scaled screen size, and
 * lists the input, timing, mutating-action and autonomy rules.
 *
 * @param {{platform?: string, device_name?: string, scaled_width: number, scaled_height: number}} deviceInfo
 *   Device metadata; `scaled_width` and `scaled_height` are interpolated as-is.
 * @param {object} [customInstructions={}] Accepted for signature compatibility; not read here.
 * @returns {string} The base prompt text (starts and ends with a newline).
 */
function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
    const deviceDescription = describeControlledDevice(deviceInfo);
    const screenSize = `${deviceInfo.scaled_width} x ${deviceInfo.scaled_height}`;
    return `
You are controlling ${deviceDescription} in a sandboxed testing environment.
Follow the user's instructions to interact with the device.

The device screen has been scaled down for display.
You can interact with any part of the visible phone screen, including system UI, browser UI, and app content.

The screen you see is ${screenSize} pixels.
Pixel (0,0) is at the top-left corner.

When aiming for visual targets:
- Reason carefully about the approximate pixel position.
- Click precisely based on your visual estimate.

Available actions: click, scroll, type, keypress, wait, screenshot.

CRITICAL - Mobile Input Constraints:
- This is a mobile device, not a desktop. Do NOT use desktop keyboard shortcuts or modifier chords.
- NEVER emit key combinations such as CTRL+A, CMD+A, CTRL+C, CTRL+V, ALT+TAB, SHIFT+ENTER, or similar shortcuts.
- Use 'keypress' only for a single mobile-safe key when absolutely necessary.
- To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
- Prefer tapping visible controls over hardware key events.
- Prefer on-screen navigation controls such as menus, tabs, drawer items, back arrows, close buttons, and explicit logout buttons over keypress actions.
- Do NOT use Back or ESC for normal app navigation when a reliable on-screen control is visible.
- Avoid using Back or ESC from a main or root screen, because it may leave the app.
- Exception: if the software keyboard is open and blocking the next needed control, Back or ESC may be used to dismiss the keyboard before continuing.
- Treat keypress actions as a fallback for limited cases only, such as a clearly needed single mobile-safe key or dismissing transient UI when no better visible control exists.

CRITICAL - Automatic Timing:
- After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
- This 500ms is sufficient for normal UI updates and animations
- DO NOT add 'wait' actions unnecessarily - trust the automatic delay

CRITICAL - Mutating Actions:
- Mutating actions change app state. Examples: submit, create, save, confirm, approve, reject, login, logout, send, place order, initiate transfer
- Before tapping a mutating action button, dismiss the software keyboard first when it is open and not required for the tap
- After performing a mutating action once, do NOT repeat the same mutating action unless the UI clearly shows the first attempt failed or had no effect
- Treat visible state change as success. Examples: form fields clear, submit button returns to normal, status changes, list refreshes, new row appears, success message appears, screen changes
- For form submissions specifically, if the relevant fields clear and the action button returns to its normal idle state, treat that as success even if the new row or confirmation is not obvious yet
- If the UI shows signs that the mutating action succeeded, stop acting for that instruction

Use explicit 'wait' action ONLY in these specific cases:
1. After launching apps from home screen or app drawer
2. After pressing ENTER that triggers navigation (search, URL, form submit)
3. After clicking links that open new apps or pages
4. After actions that trigger heavy loading (camera, maps, etc.)

When you MUST wait:
- Click app icon from home → wait → Continue
- Type in search box → Press ENTER → wait → Continue
- Click link that opens new page/app → wait → Continue
- Open camera/maps/heavy feature → wait → Continue

When you should NOT wait (automatic 500ms handles it):
- Clicking UI buttons within a running app (click button - no wait needed)
- Typing in text fields (type text - no wait needed)
- Scrolling (scroll - no wait needed)
- Clicking tabs or menu items within an app (click - no wait needed)

Rule of thumb: Wait for app launches and navigation. Everything else has automatic timing.

Perform the user's requested actions within the current view.

If unsure about visual elements, take a screenshot to improve your reasoning.
If unsure about the user's intent, make the best decision you can based on context and continue automatically.

CRITICAL - Never Ask Questions:
- NEVER ask the user for confirmation, clarification, or next steps
- NEVER ask questions like "Should I...", "Would you like...", "Do you want me to..."
- NEVER wait for user guidance - make autonomous decisions
- If stuck, try alternative approaches (go back, try different UI element, restart app)
- ONLY stop when the task is complete or you've exhausted reasonable approaches

Act decisively to complete the task.

Stop acting once the task appears complete.
Only complete the current instruction. Do not proceed beyond the current step unless asked.

Mobile-Specific Notes:
- HOME key returns to the home screen
- On Android, ESC key maps to Back
- On iOS, ESC has no effect; use visible on-screen controls instead
- Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
`;
}
|
|
139
|
+
export { appendCustomSections, buildBaseSystemPrompt, buildCustomInstructionsSection, describeControlledDevice, buildStrictModeRuntimeSection, };
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { appendCustomSections, buildBaseSystemPrompt, buildStrictModeRuntimeSection } from "./base.js";
|
|
2
|
+
/**
 * Build the system prompt used while interactively designing a test script.
 *
 * Combines the base system prompt, the design-mode rules, the optional
 * strict-mode section, and any user-configured custom instructions.
 *
 * @param {{platform?: string, device_name?: string, scaled_width: number, scaled_height: number}} deviceInfo
 *   Device metadata forwarded to the base prompt; `platform` also selects the
 *   app wording in the design-mode section.
 * @param {{basePromptInstructions?: string, designModeInstructions?: string}} [customInstructions={}]
 *   User-configured instruction texts.
 * @param {{strictMode?: boolean}} [runtimeOptions={}] Runtime flags.
 * @returns {string} The complete design-mode prompt.
 */
export function buildDesignModePrompt(deviceInfo, customInstructions = {}, runtimeOptions = {}) {
    const designCustomText = typeof customInstructions.designModeInstructions === "string" ? customInstructions.designModeInstructions.trim() : "";
    const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
    const strictModeSection = buildStrictModeRuntimeSection(runtimeOptions);
    // The base prompt already names the platform being controlled; keep this
    // section in agreement with it instead of always claiming an Android app.
    const platform = typeof deviceInfo?.platform === "string" ? deviceInfo.platform.trim().toLowerCase() : "";
    const targetApp = new Map([
        ["ios", "an iOS app"],
        ["android", "an Android app"],
    ]).get(platform) ?? "a mobile app";
    // Placeholder text shown between angle brackets in the format example.
    const scriptPlaceholder = "test script here";
    const prompt = `${basePrompt}

DESIGN MODE:
You are helping design a test script for ${targetApp}.
Some tests intentionally validate negative outcomes (errors, failures, rejected inputs). These are expected and should be treated as successful progress when they match the test goal.

Your task:
1. Understand what the user wants to test from their initial instruction
2. Explore the app autonomously to understand the flows
3. Take screenshots and interact as needed to discover the UI and behavior
4. Once you've successfully completed the user's requested flow, immediately generate the test script

CRITICAL - After Completing the Task:
- DO NOT navigate back or away from the final screen
- The final screen state is what matters for verification
- Generate the test script immediately showing the current state
- Use assertions to verify state, not navigation
- "Check that it changed" means verify the current visual state, not navigate elsewhere
- If the target validation state is visible (including expected error states), STOP actions and immediately output the final test script

CRITICAL - Recognizing When You Are Stuck:
If you find yourself:
- Repeating similar actions multiple times (e.g., opening/closing the same app repeatedly)
- Not reaching a new screen or state after several attempts
- Unsure about a higher-level decision (which tab to use, which mode to enter, where to start)
- Unable to find the UI element or feature the user mentioned

THEN STOP ACTING IMMEDIATELY and ask the user for guidance:
1. Briefly describe what you see on screen now
2. Explain what you were trying to do and why you're stuck
3. Ask a single, concrete question to unblock the next step

Example:
"Chrome is open but I don't see a search bar or new tab button. Should I open a new tab, or is there a specific way you'd like me to navigate?"

DO NOT continue brute-forcing the UI when stuck. The user prefers being asked over watching repeated failed attempts.
DO NOT ask if the user wants a script after successfully completing the flow - just generate it automatically.

CRITICAL - Off-Screen Element Discovery:
- If a required element is not visible, assume it may be off-screen before changing strategy
- Humans naturally scroll when UI appears cropped; do the same
- Use this discovery sequence before retries or fallback navigation:
1. Scroll the screen in the likely direction to reveal hidden content
2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry discovery
- Do not repeat already-successful actions while searching for an off-screen target

CRITICAL - Test Script Format Rules:
- One simple instruction per line (NO numbers, NO bullets)
- Use imperative commands: "Open X", "Click Y", "Type Z"
- Include "assert: <condition>" lines to validate expected behavior
- Normalize validation wording into assertions:
- Convert "check", "verify", "ensure", "fetch", and "compare" intent into explicit "assert: ..." lines
- Do not leave standalone "Check ..." or "Verify ..." lines in the final script
- Merge duplicate or near-duplicate validation lines into one clear assertion
- End with "exit"
- Keep it simple and executable
- When you generate the final result, include a suggested test name before the script
- The suggested test name must be very short: prefer 2 to 4 words
- Focus on the main user goal, not every assertion or detail
- The suggested test name must be lowercase, kebab-case, and filename-safe
- Use this exact final format:
Suggested test name: short-kebab-case-name

\`\`\`
<${scriptPlaceholder}>
\`\`\`

CORRECT Example:
Suggested test name: calculator-addition

\`\`\`
Open Calculator app
assert: Calculator app is visible
Type "2"
Click the plus button
Type "3"
Click the equals button
assert: result shows 5
exit
\`\`\`

WRONG Example (DON'T DO THIS):
\`\`\`
1. Open Calculator app
2. Verify the app opened
3. etc...
\`\`\`

Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
` + (strictModeSection ? `\n\n${strictModeSection}` : "");
    return appendCustomSections(prompt, [
        { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
        { title: "Design Mode Instructions", text: designCustomText }
    ]);
}
|
|
101
|
+
/**
 * Build the prompt used to resume a design session after a failed turn.
 *
 * @param {object} params
 * @param {string} params.basePrompt Prompt text to extend.
 * @param {string} params.transcript Transcript text inserted under "Transcript so far:".
 * @param {string} [params.objective] Original objective; "(not provided)" is used when null/undefined.
 * @param {string} params.errorMessage Error text quoted in the recovery notice.
 * @returns {string} The base prompt followed by the recovery-mode section.
 */
export function buildDesignRecoveryPrompt({ basePrompt, transcript, objective, errorMessage }) {
    const objectiveText = objective ?? "(not provided)";
    // Interpolated values stay inside template literals so they stringify
    // exactly as they would in a single template.
    const recoveryLines = [
        `${basePrompt}`,
        "",
        "RECOVERY MODE:",
        `The previous turn failed with error: "${errorMessage}".`,
        "Continue from the current app state without repeating completed steps unless needed.",
        "",
        "Transcript so far:",
        `${transcript}`,
        "",
        "Original objective:",
        `${objectiveText}`,
        "",
        "If the objective is already completed, generate the final test script now.",
    ];
    return recoveryLines.join("\n");
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Build the system prompt asking the model to revise an existing test script.
 *
 * @param {string} originalScript Current test script text.
 * @param {string} revisionRequest The user's requested changes.
 * @returns {string} Prompt containing the script, the request, and the format rules.
 */
export function buildTestRevisionSystemPrompt(originalScript, revisionRequest) {
    const editingContext = `You are editing a test script based on user feedback.

Current test script:
${originalScript}

User's revision request:
${revisionRequest}

Apply the user's changes and output the revised test script.`;
    const formatRules = `FORMAT RULES:
- One simple instruction per line (NO numbers, NO bullets)
- Use imperative commands: "Open X", "Click Y", "Type Z"
- Include "assert: <condition>" lines to validate expected behavior
- End with "exit"`;
    const closing = "Output only the revised test script, nothing else.";
    return `${editingContext}\n\n${formatRules}\n\n${closing}`;
}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import { appendCustomSections, buildBaseSystemPrompt, buildStrictModeRuntimeSection } from "./base.js";
|
|
2
|
+
/**
 * Wrap an app-context briefing in its prompt section.
 *
 * @param {string} briefing Briefing text; non-strings and blank strings yield no section.
 * @returns {string} The "APP CONTEXT BRIEFING" section with the trimmed briefing, or "".
 */
export function buildAppContextSection(briefing) {
    if (typeof briefing !== "string") {
        return "";
    }
    const trimmedBriefing = briefing.trim();
    if (trimmedBriefing.length === 0) {
        return "";
    }
    const sectionIntro = [
        "APP CONTEXT BRIEFING:",
        "The following is a condensed description of the app you are testing, relevant to the current task.",
        "Use this to understand screen layouts, terminology, navigation, and expected behavior.",
    ].join("\n");
    return `${sectionIntro}\n\n${trimmedBriefing}`;
}
|
|
13
|
+
/**
 * Build the system prompt used while executing test-script instructions.
 *
 * Combines the base system prompt, the execution-mode rules, the optional app
 * context briefing, the optional strict-mode section, and any user-configured
 * custom instructions.
 *
 * @param {object} deviceInfo Device metadata forwarded to the base prompt.
 * @param {{basePromptInstructions?: string, executionModeInstructions?: string}} [customInstructions={}]
 *   User-configured instruction texts.
 * @param {string} [appContextBriefing=""] Briefing text for the app under test.
 * @param {{strictMode?: boolean}} [runtimeOptions={}] Runtime flags.
 * @returns {string} The complete execution-mode prompt.
 */
export function buildExecutionModePrompt(deviceInfo, customInstructions = {}, appContextBriefing = "", runtimeOptions = {}) {
    const rawExecutionText = customInstructions.executionModeInstructions;
    const executionCustomText = typeof rawExecutionText === "string" ? rawExecutionText.trim() : "";
    const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
    const appContextSection = buildAppContextSection(appContextBriefing);
    const strictModeSection = buildStrictModeRuntimeSection(runtimeOptions);
    // Each optional section, when present, is preceded by a blank line.
    const optionalSections = [appContextSection, strictModeSection]
        .filter((section) => section)
        .map((section) => `\n\n${section}`)
        .join("");
    const executionRules = `${basePrompt}

EXECUTION MODE - Critical Behavior:
You are executing test script commands one at a time. This is NOT a conversation.

CRITICAL RULES:
- DO NOT generate conversational text or narration
- DO NOT ask questions like "What should I do next?", "Would you like...", "Can I assist...?"
- DO NOT describe what you see on screen
- DO NOT say "Let me know if you need help" or similar phrases
- Just execute the action silently and stop immediately
- Only generate text if the action FAILED or cannot be completed
- Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
- Never repeat the same mutating action with the same apparent intent unless the UI clearly shows failure or no state change
- If a submit/create/approve/reject/login action appears to succeed, stop instead of trying to reconfirm by doing it again
- For form submissions, cleared fields plus a reset action button are strong success signals; stop even if the created item is not yet obvious in the visible list
- If target is not visible, perform bounded off-screen discovery first:
1. Scroll the screen in the likely direction to reveal hidden controls
2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry

Your process:
1. Read the instruction
2. Execute the required actions
3. Before tapping a mutating action, dismiss the keyboard if it is open and not needed
4. After a mutating action, inspect the resulting screen for success cues such as cleared fields, reset buttons, changed status, refreshed content, or navigation
5. Stop as soon as success is visible
6. Stop immediately - no commentary, no questions

Each instruction is independent. Do not reference previous instructions or ask about next steps.
`;
    return appendCustomSections(executionRules + optionalSections, [
        { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
        { title: "Execution Mode Instructions", text: executionCustomText }
    ]);
}
|
|
53
|
+
/**
 * Build the prompt used to resume execution after a lost connection.
 *
 * @param {object} params
 * @param {string} params.basePrompt Prompt text to extend.
 * @param {string} [params.transcript] Transcript of previously completed actions.
 * @returns {string} `basePrompt` unchanged when the transcript is falsy;
 *   otherwise the base prompt followed by the session-recovery notice.
 */
export function buildExecutionRecoveryPrompt({ basePrompt, transcript }) {
    const hasTranscript = Boolean(transcript);
    if (hasTranscript) {
        const recoveryNotice = "[SESSION RECOVERY - Connection was lost. Previous actions completed before the error:]";
        const resumeReminder = "[IMPORTANT: Resume execution silently. Do NOT narrate or explain. Just execute the next instruction.]";
        return `${basePrompt}\n\n${recoveryNotice}\n${transcript}\n\n${resumeReminder}`;
    }
    return basePrompt;
}
|
|
64
|
+
/**
 * Extend a system prompt with the instructions for validating one assertion.
 *
 * @param {string} baseSystemPrompt Prompt text to extend.
 * @param {string} assertionPrompt The assertion statement, quoted in the prompt.
 * @returns {string} The base prompt followed by the assertion-mode section.
 */
export function buildAssertionSystemPrompt(baseSystemPrompt, assertionPrompt) {
    const verdictInstructions = `ASSERTION MODE:
You are now validating an assertion. The user has provided an assertion statement that you must verify.

Your task:
1. Take screenshots and perform LIMITED actions if needed to validate the assertion.
2. Determine if the assertion is TRUE or FALSE based on the current state.
3. You MUST respond with a clear verdict in this exact format:
- If the assertion is true, include the text: "ASSERTION RESULT: PASS"
- If the assertion is false or cannot be confidently validated, include: "ASSERTION RESULT: FAIL"
4. After the verdict, provide a brief explanation (1-2 sentences) of why it passed or failed.`;
    const reminders = `Remember:
- If you cannot confidently validate the assertion, treat it as FAIL.
- You must include either "ASSERTION RESULT: PASS" or "ASSERTION RESULT: FAIL" in your response.
- Be thorough but efficient. Only take the actions necessary to validate the assertion.`;
    return `${baseSystemPrompt}\n\n${verdictInstructions}\n\nThe assertion to validate is: "${assertionPrompt}"\n\n${reminders}`;
}
|
|
85
|
+
/**
 * Build the two-message input asking a model to condense an app context
 * document down to the facts relevant for one test task.
 *
 * @param {object} params
 * @param {string} params.contextDocument Full app context document.
 * @param {string} params.taskDescription The test task the briefing is for.
 * @param {number} params.tokenBudget Value interpolated into the "Target: under N tokens" rule.
 * @returns {Array<{role: string, content: Array<{type: string, text: string}>}>}
 *   A system message with the compaction rules and a user message carrying
 *   the document and the task.
 */
export function buildAppContextCompactionInput({ contextDocument, taskDescription, tokenBudget }) {
    // Wrap plain text in the single-part message shape used for both roles.
    const toTextMessage = (role, text) => ({
        role,
        content: [{
                type: "input_text",
                text
            }]
    });
    const compactionRules = `You are compressing an app context document for a mobile testing agent.

You will receive:
1. A context document
2. A test task

Your job is to SELECT only the facts from the context document that are useful for the given task.
The output will be injected into a system prompt with a strict token budget.

CRITICAL:
- Use only facts explicitly supported by the context document
- Never invent, infer, normalize, substitute, or improve credentials, labels, screen names, button names, or numeric values
- Preserve exact values verbatim when present in the source
- Prefer facts that help the agent act correctly when they are not obvious from the task alone
- Do not restate, paraphrase, summarize, or reorganize the test task
- The output must not read like instructions or a test plan
- Do not describe what the agent should do
- Output only reference knowledge about the app
- If a line could be copied from the task with minor wording changes, omit it
- Prefer copying source facts verbatim or near-verbatim over rewriting them
- Do not collapse multiple specific source facts into one generic summary if that removes useful distinctions

Selection priority:
1. Facts the agent would NOT know from the test script alone
2. Facts that are hard to infer from screenshots
3. Non-obvious navigation or interaction details
4. Exact visible labels needed to act correctly
5. Credentials and other exact values

High-value facts:
- exact UI labels
- how state, mode, or account selection is performed
- where logout is located
- hidden or non-obvious navigation
- which menu items are decorative or non-functional
- screen titles and section labels used to confirm location
- exact credentials and role labels

Low-value facts:
- restating the test steps
- repeating literal values already present in the task
- generic summaries like "approve the transaction"

When the task involves authentication, switching state or mode, opening menus, or moving between major areas of the app, strongly prefer including:
- how account, state, or mode selection is performed
- exact visible labels for the relevant controls
- where exit or sign-out actions are located
- the screen or section labels that confirm the agent is in the right place

Rules:
- Output plain text only
- No markdown, no bullet symbols, no numbering, no headers
- Use terse, factual language: one fact per line, no filler words
- Blank lines only to separate logical groups
- Prefer exact visible UI labels over summaries
- Do not describe step-by-step procedures
- Do not restate the test workflow
- State only facts about screens, elements, hidden interactions, entities, credentials, and navigation
- If a useful fact is not explicitly stated in the context document, omit it
- Include only information relevant to this task
- Do not waste space repeating the task itself
- If the task already states a value or action, include it only when the context adds non-obvious execution details
- Return a short result or an empty string if little is relevant
- Target: under ${tokenBudget} tokens

Bad output patterns to avoid:
- generic summaries that remove actionable details
- lines that restate the task in generic prose
- lines that describe obvious workflow steps instead of app knowledge
- lines that replace exact source labels or mechanisms with broad summaries

Good output characteristics:
- preserves the exact label or mechanism from the source when it matters
- keeps distinctions like dropdown vs tabs, drawer vs visible button, exact section titles, exact button text
- includes hidden or non-obvious navigation details when relevant

Return only the briefing text.`;
    const documentAndTask = `APP CONTEXT DOCUMENT:
${contextDocument}

TASK:
${taskDescription}`;
    return [
        toTextMessage("system", compactionRules),
        toTextMessage("user", documentAndTask)
    ];
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
 * Build the chat messages asking a model to parse a natural-language
 * Loadmill command into `{ searchQuery, parameters, action }` JSON.
 *
 * @param {string} userInput The raw command typed by the user.
 * @returns {Array<{role: string, content: string}>} A system message with the
 *   parsing rules and examples, followed by the user's input.
 */
export function buildLoadmillCommandInterpretationMessages(userInput) {
    const parserInstructions = `You are a parser that extracts structured data from natural language Loadmill commands.

Extract the following from the user's input:
1. searchQuery: The flow name or description to search for (required). FIX any obvious typos or misspellings.
2. parameters: Any key=value pairs mentioned (as an object)
3. action: Either "run" (if user wants to execute) or "search" (if user just wants to find flows)

Output JSON only, no markdown or explanation.

Examples:
Input: "run the checkout flow with user=test123"
Output: {"searchQuery": "checkout flow", "parameters": {"user": "test123"}, "action": "run"}

Input: "search for login test"
Output: {"searchQuery": "login test", "parameters": {}, "action": "search"}

Input: "run user authentication with email=test@example.com password=secret123"
Output: {"searchQuery": "user authentication", "parameters": {"email": "test@example.com", "password": "secret123"}, "action": "run"}

Input: "execute payment flow"
Output: {"searchQuery": "payment flow", "parameters": {}, "action": "run"}

Input: "create a transction with amount=200"
Output: {"searchQuery": "transaction", "parameters": {"amount": "200"}, "action": "run"}`;
    const systemMessage = { role: "system", content: parserInstructions };
    const userMessage = { role: "user", content: userInput };
    return [systemMessage, userMessage];
}
|
|
36
|
+
/**
 * Build the chat messages asking a model to pick the best-matching flow.
 *
 * @param {string} originalQuery The user's search query, quoted in the user message.
 * @param {string} flowList Pre-formatted list of available flows.
 * @returns {Array<{role: string, content: string}>} A system message describing
 *   the `{ index, confidence }` JSON output, followed by the query and flow list.
 */
export function buildLoadmillFlowSelectionMessages(originalQuery, flowList) {
    const selectionInstructions = [
        "You are selecting the best matching test flow based on a user query.",
        "",
        "Given the user's query and a list of available flows, select the best match.",
        "",
        "Output JSON with:",
        "- index: 1-based index of the best matching flow",
        "- confidence: number between 0 and 1 indicating how confident you are",
        "",
        "If no flow seems to match well, set confidence to a low value (< 0.5).",
        "",
        "Output JSON only, no markdown.",
    ].join("\n");
    const queryAndFlows = `Query: "${originalQuery}"\n\nAvailable flows:\n${flowList}`;
    return [
        { role: "system", content: selectionInstructions },
        { role: "user", content: queryAndFlows },
    ];
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { readdir, readFile, writeFile, unlink, stat, mkdir } from "fs/promises";
|
|
2
2
|
import path from "path";
|
|
3
|
+
import { countExecutableInstructions, parseTestScript } from "./test-script.js";
|
|
3
4
|
// Tests directory is relative to current working directory
|
|
4
5
|
const TESTS_DIR = path.join(process.cwd(), "tests");
|
|
5
6
|
/**
|
|
@@ -27,10 +28,7 @@ export async function loadTest(name) {
|
|
|
27
28
|
const filename = name.endsWith(".dcua") ? name : `${name}.dcua`;
|
|
28
29
|
const filepath = path.join(TESTS_DIR, filename);
|
|
29
30
|
const content = await readFile(filepath, "utf-8");
|
|
30
|
-
return content
|
|
31
|
-
.split("\n")
|
|
32
|
-
.map(line => line.trim())
|
|
33
|
-
.filter(line => line.length > 0);
|
|
31
|
+
return parseTestScript(content);
|
|
34
32
|
}
|
|
35
33
|
/**
|
|
36
34
|
* Get the raw content of a test file
|
|
@@ -60,7 +58,7 @@ export async function listTests() {
|
|
|
60
58
|
const filepath = path.join(TESTS_DIR, filename);
|
|
61
59
|
const stats = await stat(filepath);
|
|
62
60
|
const content = await readFile(filepath, "utf-8");
|
|
63
|
-
const lines = content
|
|
61
|
+
const lines = countExecutableInstructions(content);
|
|
64
62
|
return {
|
|
65
63
|
name: filename.replace(".dcua", ""),
|
|
66
64
|
filename: filename,
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
 * Locate where an inline comment begins in a test script line.
 * A `//` pair counts as a comment marker only when it sits at the very start
 * of the line or directly after a whitespace character, so `//` inside text
 * such as `https://host` is not treated as a comment.
 *
 * @param {string} line
 * @returns {number} Index of the first qualifying `//`, or -1 when there is none.
 */
export function findCommentStart(line) {
    // The lookbehind keeps the reported index on the first slash itself.
    return line.search(/(?<=^|\s)\/\//);
}
|
|
19
|
+
/**
 * Return the instruction part of a line: everything before the inline
 * comment marker (if any), with leading and trailing whitespace removed.
 *
 * @param {string} line
 * @returns {string}
 */
export function stripInstructionComment(line) {
    const markerIndex = findCommentStart(line);
    if (markerIndex < 0) {
        return line.trim();
    }
    return line.slice(0, markerIndex).trim();
}
|
|
30
|
+
/**
 * Turn raw .dcua content into its list of executable instructions.
 * Each line has its inline comment and surrounding whitespace stripped;
 * lines that end up empty are left out.
 *
 * @param {string} content
 * @returns {string[]}
 */
export function parseTestScript(content) {
    const instructions = [];
    for (const rawLine of content.split("\n")) {
        const instruction = stripInstructionComment(rawLine);
        if (instruction.length > 0) {
            instructions.push(instruction);
        }
    }
    return instructions;
}
|
|
42
|
+
/**
 * Report how many executable instructions raw .dcua content contains,
 * i.e. the number of entries parseTestScript returns for it.
 *
 * @param {string} content
 * @returns {number}
 */
export function countExecutableInstructions(content) {
    const { length: instructionCount } = parseTestScript(content);
    return instructionCount;
}
|