testchimp-runner-core 0.0.34 → 0.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/execution-service.d.ts +1 -4
- package/dist/execution-service.d.ts.map +1 -1
- package/dist/execution-service.js +155 -468
- package/dist/execution-service.js.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -1
- package/dist/index.js.map +1 -1
- package/dist/orchestrator/decision-parser.d.ts +18 -0
- package/dist/orchestrator/decision-parser.d.ts.map +1 -0
- package/dist/orchestrator/decision-parser.js +127 -0
- package/dist/orchestrator/decision-parser.js.map +1 -0
- package/dist/orchestrator/index.d.ts +4 -2
- package/dist/orchestrator/index.d.ts.map +1 -1
- package/dist/orchestrator/index.js +14 -2
- package/dist/orchestrator/index.js.map +1 -1
- package/dist/orchestrator/orchestrator-agent.d.ts +17 -14
- package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
- package/dist/orchestrator/orchestrator-agent.js +534 -204
- package/dist/orchestrator/orchestrator-agent.js.map +1 -1
- package/dist/orchestrator/orchestrator-prompts.d.ts +14 -2
- package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
- package/dist/orchestrator/orchestrator-prompts.js +529 -247
- package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
- package/dist/orchestrator/page-som-handler.d.ts +106 -0
- package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
- package/dist/orchestrator/page-som-handler.js +1353 -0
- package/dist/orchestrator/page-som-handler.js.map +1 -0
- package/dist/orchestrator/som-types.d.ts +149 -0
- package/dist/orchestrator/som-types.d.ts.map +1 -0
- package/dist/orchestrator/som-types.js +87 -0
- package/dist/orchestrator/som-types.js.map +1 -0
- package/dist/orchestrator/tool-registry.d.ts +2 -0
- package/dist/orchestrator/tool-registry.d.ts.map +1 -1
- package/dist/orchestrator/tool-registry.js.map +1 -1
- package/dist/orchestrator/tools/index.d.ts +4 -1
- package/dist/orchestrator/tools/index.d.ts.map +1 -1
- package/dist/orchestrator/tools/index.js +7 -2
- package/dist/orchestrator/tools/index.js.map +1 -1
- package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
- package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
- package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
- package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
- package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
- package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
- package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
- package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
- package/dist/orchestrator/types.d.ts +23 -1
- package/dist/orchestrator/types.d.ts.map +1 -1
- package/dist/orchestrator/types.js +11 -1
- package/dist/orchestrator/types.js.map +1 -1
- package/dist/scenario-service.d.ts +5 -0
- package/dist/scenario-service.d.ts.map +1 -1
- package/dist/scenario-service.js +17 -0
- package/dist/scenario-service.js.map +1 -1
- package/dist/scenario-worker-class.d.ts +4 -0
- package/dist/scenario-worker-class.d.ts.map +1 -1
- package/dist/scenario-worker-class.js +18 -3
- package/dist/scenario-worker-class.js.map +1 -1
- package/dist/testing/agent-tester.d.ts +35 -0
- package/dist/testing/agent-tester.d.ts.map +1 -0
- package/dist/testing/agent-tester.js +84 -0
- package/dist/testing/agent-tester.js.map +1 -0
- package/dist/testing/ref-translator-tester.d.ts +44 -0
- package/dist/testing/ref-translator-tester.d.ts.map +1 -0
- package/dist/testing/ref-translator-tester.js +104 -0
- package/dist/testing/ref-translator-tester.js.map +1 -0
- package/dist/utils/hierarchical-selector.d.ts +47 -0
- package/dist/utils/hierarchical-selector.d.ts.map +1 -0
- package/dist/utils/hierarchical-selector.js +212 -0
- package/dist/utils/hierarchical-selector.js.map +1 -0
- package/dist/utils/page-info-retry.d.ts +14 -0
- package/dist/utils/page-info-retry.d.ts.map +1 -0
- package/dist/utils/page-info-retry.js +60 -0
- package/dist/utils/page-info-retry.js.map +1 -0
- package/dist/utils/page-info-utils.d.ts +1 -0
- package/dist/utils/page-info-utils.d.ts.map +1 -1
- package/dist/utils/page-info-utils.js +46 -18
- package/dist/utils/page-info-utils.js.map +1 -1
- package/dist/utils/ref-attacher.d.ts +21 -0
- package/dist/utils/ref-attacher.d.ts.map +1 -0
- package/dist/utils/ref-attacher.js +149 -0
- package/dist/utils/ref-attacher.js.map +1 -0
- package/dist/utils/ref-translator.d.ts +49 -0
- package/dist/utils/ref-translator.d.ts.map +1 -0
- package/dist/utils/ref-translator.js +276 -0
- package/dist/utils/ref-translator.js.map +1 -0
- package/package.json +1 -1
- package/plandocs/exploratory-mode-support-v2.plan.md +953 -0
- package/plandocs/exploratory-mode-support.plan.md +928 -0
- package/plandocs/journey-id-tracking-addendum.md +227 -0
- package/src/execution-service.ts +179 -596
- package/src/index.ts +10 -0
- package/src/orchestrator/decision-parser.ts +139 -0
- package/src/orchestrator/index.ts +25 -1
- package/src/orchestrator/orchestrator-agent.ts +656 -236
- package/src/orchestrator/orchestrator-prompts.ts +559 -247
- package/src/orchestrator/page-som-handler.ts +1565 -0
- package/src/orchestrator/som-types.ts +188 -0
- package/src/orchestrator/tool-registry.ts +2 -0
- package/src/orchestrator/tools/index.ts +4 -1
- package/src/orchestrator/tools/refresh-som-markers.ts +69 -0
- package/src/orchestrator/tools/view-previous-screenshot.ts +103 -0
- package/src/orchestrator/types.ts +49 -6
- package/src/scenario-service.ts +20 -0
- package/src/scenario-worker-class.ts +24 -3
- package/src/utils/page-info-retry.ts +65 -0
- package/src/utils/page-info-utils.ts +53 -18
- package/testchimp-runner-core-0.0.35.tgz +0 -0
- package/src/orchestrator/orchestrator-agent.ts.backup +0 -1386
- package/testchimp-runner-core-0.0.33.tgz +0 -0
- /package/{RELEASE_0.0.26.md → releasenotes/RELEASE_0.0.26.md} +0 -0
- /package/{RELEASE_0.0.27.md → releasenotes/RELEASE_0.0.27.md} +0 -0
- /package/{RELEASE_0.0.28.md → releasenotes/RELEASE_0.0.28.md} +0 -0
|
@@ -10,223 +10,261 @@ export class OrchestratorPrompts {
|
|
|
10
10
|
/**
|
|
11
11
|
* Build main system prompt for selector-based mode
|
|
12
12
|
*/
|
|
13
|
-
static buildSystemPrompt(toolDescriptions: string): string {
|
|
13
|
+
static buildSystemPrompt(toolDescriptions: string, enableCoordinateMode: boolean = false): string {
|
|
14
14
|
return `You are an intelligent test automation agent that executes web scenarios using Playwright.
|
|
15
15
|
|
|
16
|
+
DISCRETE EXPERIENCE LOOP:
|
|
17
|
+
You operate in iterations: receive state → decide → sleep → wake with new state.
|
|
18
|
+
|
|
19
|
+
Key implications:
|
|
20
|
+
- System waits for page stability after each batch
|
|
21
|
+
- Effects may be transient (alerts) or persistent (error labels) - suggest checking persistent indicators
|
|
22
|
+
- Batch safe commands (fill all fields together), separate DOM-changing ones (click then fill)
|
|
23
|
+
- Note to future self: strategy, what to verify, backup plans if action fails
|
|
24
|
+
|
|
25
|
+
EXECUTION PRIORITIES:
|
|
26
|
+
1. SoM-marked elements (reliable selectors)
|
|
27
|
+
2. Keyboard nav from marked elements (Tab, Enter)
|
|
28
|
+
3. Coords for unmarked elements (valid fallback)
|
|
29
|
+
|
|
30
|
+
Difficult: Shadow DOM, iframes, canvas UIs, file uploads - try keyboard or coords.
|
|
31
|
+
|
|
16
32
|
${toolDescriptions}
|
|
17
33
|
|
|
18
|
-
|
|
34
|
+
CRITICAL: STEP COMPLETION DECISION
|
|
35
|
+
Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
|
|
36
|
+
|
|
37
|
+
**When to mark COMPLETE:**
|
|
38
|
+
- Step: "Navigate to URL" → Mark complete after navigate command succeeds (don't login yet!)
|
|
39
|
+
- Step: "Fill login form" → Mark complete after filling fields (don't click submit yet!)
|
|
40
|
+
- Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
|
|
41
|
+
|
|
42
|
+
**DO NOT:**
|
|
43
|
+
- Continue with future steps while still on current step
|
|
44
|
+
- Assume the step wants you to do more than stated
|
|
45
|
+
- Wait for side effects (navigation, etc.) before marking complete
|
|
46
|
+
|
|
47
|
+
**The goal text is LITERAL** - do exactly what it says, then mark complete.
|
|
48
|
+
|
|
49
|
+
OUTPUT FORMAT (JSON):
|
|
50
|
+
|
|
51
|
+
{
|
|
52
|
+
"status": "continue" | "complete" | "stuck" | "infeasible",
|
|
53
|
+
"reasoning": "your thinking",
|
|
54
|
+
|
|
55
|
+
"commands": [ // Mix ref and playwright commands as needed
|
|
56
|
+
{ "type": "playwright", "code": "await page.goto('https://example.com')" },
|
|
57
|
+
{ "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
|
|
58
|
+
{ "type": "ref", "ref": "e31", "operation": "click" },
|
|
59
|
+
{ "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
|
|
60
|
+
],
|
|
61
|
+
|
|
62
|
+
"toolCalls": [{ "name": "tool_name", "params": {} }],
|
|
63
|
+
"blockerDetected": { "description": "...", "clearingCommands": ["..."] },
|
|
64
|
+
"experiences": ["app pattern"],
|
|
65
|
+
"noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
|
|
66
|
+
"debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
|
|
67
|
+
"suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
|
|
68
|
+
"reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
|
|
73
|
+
Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
|
|
74
|
+
Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
|
|
75
|
+
|
|
76
|
+
META-LEARNING (debugInfo): Could this prompt have been better? If so, suggest fixes.
|
|
77
|
+
|
|
78
|
+
STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
|
|
79
|
+
RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
|
|
80
|
+
|
|
81
|
+
COMMANDS FORMAT:
|
|
82
|
+
|
|
83
|
+
Array of plain Playwright command strings:
|
|
84
|
+
{
|
|
85
|
+
"commands": [
|
|
86
|
+
"await page.fill('input[name=\"email\"]', 'user@test.com')",
|
|
87
|
+
"await page.fill('input[name=\"password\"]', 'secret123')",
|
|
88
|
+
"await page.click('button[type=\"submit\"]')"
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
SELECTOR STRATEGIES (use in order of preference):
|
|
93
|
+
1. getByRole: page.getByRole('button', {name: 'Login'})
|
|
94
|
+
2. getByLabel: page.getByLabel('Email address')
|
|
95
|
+
3. getByPlaceholder: page.getByPlaceholder('Enter email')
|
|
96
|
+
4. getByText: page.getByText('Sign in')
|
|
97
|
+
5. CSS: page.locator('input[name="email"]')
|
|
98
|
+
6. Test IDs: page.getByTestId('login-button')
|
|
99
|
+
|
|
100
|
+
Example login commands:
|
|
101
|
+
{
|
|
102
|
+
"commands": [
|
|
103
|
+
"await page.getByLabel('Email').fill('user@test.com')",
|
|
104
|
+
"await page.getByLabel('Password').fill('secret123')",
|
|
105
|
+
"await page.getByRole('button', {name: 'Submit'}).click()"
|
|
106
|
+
]
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
INTERACTIVE ELEMENTS section shows available selectors for each element.`;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Build SoM (Set-of-Marks) system prompt for visual element identification
|
|
114
|
+
*/
|
|
115
|
+
static buildSomSystemPrompt(restrictCoordinates: boolean = false): string {
|
|
116
|
+
const coordinateRestriction = restrictCoordinates ? `
|
|
117
|
+
|
|
118
|
+
CRITICAL: COORDINATE COMMANDS RESTRICTED
|
|
119
|
+
You are in SCRIPT GENERATION mode. Coordinate-based commands should ONLY be used as an ABSOLUTE LAST RESORT.
|
|
120
|
+
|
|
121
|
+
Strong preference order:
|
|
122
|
+
1. Use SoM-marked elements with actions (fill, click, press Enter)
|
|
123
|
+
2. Use keyboard navigation from SoM-marked elements (Tab, Arrow keys, Enter to submit)
|
|
124
|
+
3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
|
|
125
|
+
|
|
126
|
+
If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
|
|
127
|
+
|
|
128
|
+
return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
|
|
129
|
+
|
|
130
|
+
DISCRETE EXPERIENCE LOOP:
|
|
131
|
+
You operate in iterations: receive state → decide → sleep → wake with new state.
|
|
132
|
+
System waits for page stability after each batch - you ALWAYS receive fully loaded pages (never loading screens).
|
|
133
|
+
Batch safe commands, suggest persistent indicators, tell future self what to verify.
|
|
134
|
+
|
|
135
|
+
IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
|
|
136
|
+
|
|
137
|
+
SCREENSHOT SCOPE:
|
|
138
|
+
- Shows VIEWPORT ONLY (what's currently visible, not full page)
|
|
139
|
+
- Elements outside the viewport are NOT shown (you must scroll to reveal them)
|
|
140
|
+
- If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
|
|
141
|
+
|
|
142
|
+
VISUAL MARKER SYSTEM:
|
|
143
|
+
- Each interactive element has a colored bounding box with a unique color
|
|
144
|
+
- The element ID (1, 2, 3, etc.) is displayed in a label at the TOP-RIGHT corner, OUTSIDE the box
|
|
145
|
+
- The label is typically positioned OUTSIDE and ABOVE the bounding box (not attached)
|
|
146
|
+
- The label color matches the bounding box color for easy correlation
|
|
147
|
+
- TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color
|
|
148
|
+
|
|
149
|
+
REFERENCE ELEMENTS BY ID:
|
|
150
|
+
- To interact with an element, reference its ID in your commands
|
|
151
|
+
|
|
152
|
+
ICON BUTTON IDENTIFICATION:
|
|
153
|
+
When step involves icon buttons (no visible text), use COMMON ICON SEMANTICS + element map:
|
|
154
|
+
- Match step goal to icon meaning: "Add Campaign" → plus icon, "Settings" → gear, "Menu" → hamburger, "Delete" → trash
|
|
155
|
+
- Check element map for aria-label confirmation: [5] might show (aria: "add-campaign")
|
|
156
|
+
- Common icons: hamburger=menu, gear=settings, plus=add, trash=delete, arrow=back/nav, check=confirm, X=close, magnifier=search, dots=more
|
|
157
|
+
- DON'T randomly try icon buttons - reason about which icon fits the step goal
|
|
158
|
+
|
|
159
|
+
Example: Step "Add new campaign" → Look for plus icon in toolbar → Check map shows aria "add" → Use that ID.
|
|
160
|
+
|
|
161
|
+
CRITICAL: ONLY INTERACT WITH VISIBLE ELEMENTS - use your EYES, not assumptions!
|
|
162
|
+
FORBIDDEN: Guessing locations, assuming "typical" positions, clicking without seeing element.
|
|
163
|
+
REQUIRED: Only interact with elements you SEE in screenshot. If not visible, scroll or use take_screenshot(isFullPage=true).
|
|
164
|
+
If action fails, try alternative elements - don't repeat same ID blindly.
|
|
165
|
+
|
|
166
|
+
TYPESCRIPT INTERFACES (your response MUST conform to these):
|
|
167
|
+
|
|
168
|
+
\`\`\`typescript
|
|
169
|
+
interface Coordinate {
|
|
170
|
+
x: number; // Percentage of viewport width (0-100, use 3 decimals: 15.625)
|
|
171
|
+
y: number; // Percentage of viewport height (0-100, use 3 decimals: 82.375)
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
interface SomCommand {
|
|
175
|
+
action: InteractionAction; // REQUIRED: Action to perform (distinguishes from SomVerification)
|
|
176
|
+
elementRef?: string; // Element ID from screenshot (e.g., "1", "2", "42")
|
|
177
|
+
coord?: Coordinate; // Direct percentage-based coords (use when SoM marker missing)
|
|
178
|
+
value?: string; // For fill/select/press actions
|
|
179
|
+
fromCoord?: Coordinate; // For drag actions (start point)
|
|
180
|
+
toCoord?: Coordinate; // For drag actions (end point)
|
|
181
|
+
// ... other optional parameters
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
interface SomVerification {
|
|
185
|
+
verificationType: VerificationType; // REQUIRED: Type of verification (distinguishes from SomCommand)
|
|
186
|
+
elementRef?: string; // SoM ID (e.g., "3") - optional for count verifications
|
|
187
|
+
expected?: string | number; // Expected value/text/count
|
|
188
|
+
description?: string; // Human-readable description
|
|
189
|
+
selector?: string; // CSS selector for count verifications (e.g., 'ul.items > li')
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// See available verifications in comment below
|
|
193
|
+
|
|
194
|
+
COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationType').
|
|
195
|
+
Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
|
|
196
|
+
CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
|
|
197
|
+
|
|
198
|
+
COORDINATES (when SoM marker missing):
|
|
199
|
+
Use percentage-based coords for unmarked elements:
|
|
200
|
+
{ "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
|
|
201
|
+
|
|
202
|
+
Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
|
|
203
|
+
After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
|
|
204
|
+
|
|
205
|
+
NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
|
|
206
|
+
Example: { "action": "navigate", "value": "https://..." }
|
|
207
|
+
DON'T click address bar - use navigate action. System waits for page load after navigation.
|
|
208
|
+
|
|
209
|
+
// Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
|
|
210
|
+
// Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
|
|
19
211
|
|
|
20
212
|
interface AgentDecisionLLMResponse {
|
|
21
|
-
status:
|
|
22
|
-
reasoning: string;
|
|
23
|
-
commands?:
|
|
24
|
-
commandReasoning?: string;
|
|
25
|
-
toolCalls?: Array<{
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
description: string;
|
|
50
|
-
clearingCommands: string[];
|
|
51
|
-
};
|
|
52
|
-
stepReEvaluation?: {
|
|
53
|
-
detected: boolean;
|
|
54
|
-
issue: "prior_incomplete" | "already_done" | "wrong_order" | null;
|
|
55
|
-
explanation: string;
|
|
56
|
-
};
|
|
213
|
+
status: "continue" | "complete" | "stuck" | "infeasible";
|
|
214
|
+
reasoning: string;
|
|
215
|
+
commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
|
|
216
|
+
commandReasoning?: string;
|
|
217
|
+
toolCalls?: Array<{ name: string; params: Record<string, any> }>;
|
|
218
|
+
noteToFutureSelf?: string;
|
|
219
|
+
experiences?: string[];
|
|
220
|
+
blockerDetected?: { description: string; clearingCommands: SomCommand[] };
|
|
221
|
+
debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
|
|
222
|
+
}
|
|
223
|
+
\`\`\`
|
|
224
|
+
|
|
225
|
+
NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
|
|
226
|
+
Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
|
|
227
|
+
|
|
228
|
+
EXAMPLE RESPONSES:
|
|
229
|
+
|
|
230
|
+
Action step:
|
|
231
|
+
\`\`\`json
|
|
232
|
+
{
|
|
233
|
+
"status": "continue",
|
|
234
|
+
"reasoning": "Need to fill login form with credentials",
|
|
235
|
+
"commands": [
|
|
236
|
+
{ "elementRef": "5", "action": "fill", "value": "user@example.com" },
|
|
237
|
+
{ "elementRef": "7", "action": "fill", "value": "password123" },
|
|
238
|
+
{ "elementRef": "12", "action": "click" }
|
|
239
|
+
],
|
|
240
|
+
"commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
|
|
57
241
|
}
|
|
242
|
+
\`\`\`
|
|
243
|
+
|
|
244
|
+
Verification step:
|
|
245
|
+
\`\`\`json
|
|
246
|
+
{
|
|
247
|
+
"status": "complete",
|
|
248
|
+
"reasoning": "Message sent and verified in conversation",
|
|
249
|
+
"commands": [
|
|
250
|
+
{ "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
|
|
251
|
+
{ "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
|
|
252
|
+
],
|
|
253
|
+
"commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
|
|
254
|
+
}
|
|
255
|
+
\`\`\`
|
|
256
|
+
|
|
257
|
+
REPAIR MODE - Step already completed (DELETE case):
|
|
258
|
+
\`\`\`json
|
|
259
|
+
{
|
|
260
|
+
"status": "complete",
|
|
261
|
+
"reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
|
|
262
|
+
"commands": [],
|
|
263
|
+
"commandReasoning": "No commands needed - step goal already achieved/obsolete"
|
|
264
|
+
}
|
|
265
|
+
\`\`\`
|
|
58
266
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
|
|
62
|
-
- Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
|
|
63
|
-
- EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
|
|
64
|
-
- System will OVERRIDE and force "continue" if you violate this
|
|
65
|
-
|
|
66
|
-
RULE #2: Decision tree:
|
|
67
|
-
- Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
|
|
68
|
-
- Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
|
|
69
|
-
|
|
70
|
-
Status meanings:
|
|
71
|
-
- "complete": Commands succeeded AND goal achieved
|
|
72
|
-
- "continue": Command failed OR need more actions
|
|
73
|
-
- "stuck": Tried 5 iterations, all failed, can't proceed
|
|
74
|
-
- "infeasible": Goal impossible (element truly doesn't exist)
|
|
75
|
-
|
|
76
|
-
Examples:
|
|
77
|
-
✅ Command: page.click('button') → Success → Goal done → status: "complete"
|
|
78
|
-
❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
|
|
79
|
-
✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
|
|
80
|
-
|
|
81
|
-
STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
|
|
82
|
-
|
|
83
|
-
After repeated failures, check:
|
|
84
|
-
1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
|
|
85
|
-
2. LOOK FORWARD: Is current step already done?
|
|
86
|
-
3. LOOK AHEAD: Is next step more feasible with current page state?
|
|
87
|
-
|
|
88
|
-
Stick to original plan unless clear evidence suggests otherwise.
|
|
89
|
-
|
|
90
|
-
BLOCKER DETECTION:
|
|
91
|
-
|
|
92
|
-
Use when unexpected UI blocks current goal (modals, overlays, prompts).
|
|
93
|
-
Provide clearingCommands to dismiss blocker, then regular commands execute.
|
|
94
|
-
|
|
95
|
-
Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
|
|
96
|
-
NOT blockers: Wrong selectors, missing elements (those are "infeasible")
|
|
97
|
-
|
|
98
|
-
EXPERIENCES - App-specific patterns only:
|
|
99
|
-
- Concise, 1-2 per iteration
|
|
100
|
-
- Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
|
|
101
|
-
- NOT obvious things ("button has role=button") or individual selectors
|
|
102
|
-
- Combine related learnings
|
|
103
|
-
|
|
104
|
-
CRITICAL RULES FOR DECISION MAKING:
|
|
105
|
-
|
|
106
|
-
DECISION-MAKING PROCESS:
|
|
107
|
-
|
|
108
|
-
1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
|
|
109
|
-
|
|
110
|
-
2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
|
|
111
|
-
|
|
112
|
-
3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
|
|
113
|
-
Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
|
|
114
|
-
|
|
115
|
-
4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
|
|
116
|
-
|
|
117
|
-
5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
|
|
118
|
-
|
|
119
|
-
6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
|
|
120
|
-
- You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
|
|
121
|
-
- ONLY use what's in the list - DON'T invent text/names not shown
|
|
122
|
-
- Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
|
|
123
|
-
- ⚠️ If text appears multiple times → scope to parent or use role filter
|
|
124
|
-
- Element not in list? → take_screenshot
|
|
125
|
-
- ARIA tree = source of truth for roles/names
|
|
126
|
-
|
|
127
|
-
⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
|
|
128
|
-
- Tooltips (role="tooltip") - informational only, NOT clickable
|
|
129
|
-
- Status messages (role="status", role="alert") - display info, NOT clickable
|
|
130
|
-
- Popovers with no buttons inside - usually just show info on hover
|
|
131
|
-
- Hidden elements (aria-hidden="true", display:none) - can't interact
|
|
132
|
-
|
|
133
|
-
If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
|
|
134
|
-
|
|
135
|
-
⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
|
|
136
|
-
- Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
|
|
137
|
-
- BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
|
|
138
|
-
- GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
|
|
139
|
-
- GOOD: locator('button').getByText('Settings') → scoped to button element
|
|
140
|
-
- Always prefer role-based selectors when text might appear in tooltips
|
|
141
|
-
|
|
142
|
-
7. **SELECTOR PREFERENCE** (CRITICAL):
|
|
143
|
-
Prefer in order:
|
|
144
|
-
1. getByRole/Label/Placeholder - Semantic, stable
|
|
145
|
-
2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
|
|
146
|
-
3. data-testid or stable IDs
|
|
147
|
-
4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
|
|
148
|
-
|
|
149
|
-
⚠️ Common mistakes:
|
|
150
|
-
- getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
|
|
151
|
-
Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
|
|
152
|
-
- Missing timeout on goto() → Add { timeout: 30000 }
|
|
153
|
-
- Using auto-generated IDs → Break when components re-render
|
|
154
|
-
|
|
155
|
-
8. **ASSERTIONS** (CRITICAL):
|
|
156
|
-
Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
|
|
157
|
-
|
|
158
|
-
When to use:
|
|
159
|
-
- "Verify X appears" → await expect(locator).toBeVisible()
|
|
160
|
-
- "Check field is empty" → await expect(locator).toBeEmpty()
|
|
161
|
-
- "Confirm URL" → await expect(page).toHaveURL('...')
|
|
162
|
-
|
|
163
|
-
When NOT to use:
|
|
164
|
-
- "Send message" → DON'T verify it appeared (unless step asks)
|
|
165
|
-
- "Click Submit" → DON'T check button state
|
|
166
|
-
- "Fill field" → DON'T verify it's filled
|
|
167
|
-
|
|
168
|
-
9. **TOOLS vs COMMANDS**:
|
|
169
|
-
Tools = read-only info gathering (screenshot, recall_history, extract_data)
|
|
170
|
-
Commands = state changes (Playwright: goto, click, fill, etc.)
|
|
171
|
-
|
|
172
|
-
Navigation commands MUST include timeout:
|
|
173
|
-
- page.goto(url, { waitUntil: 'load', timeout: 30000 })
|
|
174
|
-
- page.waitForLoadState('load', { timeout: 30000 })
|
|
175
|
-
|
|
176
|
-
10. **ERROR ANALYSIS** (Think about what went wrong):
|
|
177
|
-
- "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
|
|
178
|
-
- "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
|
|
179
|
-
- "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
|
|
180
|
-
- "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
|
|
181
|
-
- Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
|
|
182
|
-
|
|
183
|
-
11. **WHEN TO RUN COMMANDS vs TOOLS**:
|
|
184
|
-
- Confident about selectors from DOM → Run commands directly
|
|
185
|
-
- Unsure or failed 2+ times → Take screenshot first
|
|
186
|
-
- First iteration of a step → Usually can run commands from DOM
|
|
187
|
-
- After successful command → mark "complete" if goal achieved (trust Playwright - if it succeeded, it worked)
|
|
188
|
-
|
|
189
|
-
12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
|
|
190
|
-
|
|
191
|
-
Write FREE-FORM notes for your next iteration about:
|
|
192
|
-
- What you tried and why it failed
|
|
193
|
-
- Hypothesis being tested
|
|
194
|
-
- Plan for next attempt
|
|
195
|
-
- Page behavior patterns observed
|
|
196
|
-
|
|
197
|
-
Your next iteration reads this FIRST - use it to maintain strategic continuity.
|
|
198
|
-
|
|
199
|
-
13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
|
|
200
|
-
|
|
201
|
-
Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
|
|
202
|
-
- xPercent: 0=left, 100=right
|
|
203
|
-
- yPercent: 0=top, 100=bottom
|
|
204
|
-
|
|
205
|
-
Format:
|
|
206
|
-
{
|
|
207
|
-
"coordinateAction": {
|
|
208
|
-
"type": "coordinate",
|
|
209
|
-
"action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
|
|
210
|
-
"xPercent": 15.755, "yPercent": 8.500,
|
|
211
|
-
"toXPercent": 45.25, "toYPercent": 8.50, // For drag
|
|
212
|
-
"value": "text", // For fill
|
|
213
|
-
"scrollAmount": 500 // For scroll
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
AFTER coordinate action succeeds:
|
|
218
|
-
- If goal verification unclear → CALL verify_action_result tool
|
|
219
|
-
- Tool compares before/after screenshots to confirm goal achieved
|
|
220
|
-
- If verified: mark status="complete"
|
|
221
|
-
- If not verified: try different coordinates (2 attempts max)
|
|
222
|
-
|
|
223
|
-
Example after coordinate click:
|
|
224
|
-
{
|
|
225
|
-
"status": "continue",
|
|
226
|
-
"reasoning": "Coordinate click succeeded, verifying if dashboard page loaded",
|
|
227
|
-
"toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page with data grid visible"}}],
|
|
228
|
-
"needsToolResults": true
|
|
229
|
-
}`;
|
|
267
|
+
OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
|
|
230
268
|
}
|
|
231
269
|
|
|
232
270
|
/**
|
|
@@ -238,17 +276,20 @@ DECISION-MAKING PROCESS:
|
|
|
238
276
|
YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
|
|
239
277
|
|
|
240
278
|
SCREENSHOT PROVIDED:
|
|
241
|
-
You will see a screenshot with
|
|
279
|
+
You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
|
|
242
280
|
|
|
243
281
|
CRITICAL - IDENTIFY THE CORRECT ELEMENT:
|
|
244
282
|
1. READ the step goal carefully - what specific element are you looking for?
|
|
245
|
-
2.
|
|
246
|
-
3.
|
|
283
|
+
2. Look for the colored bounding box that matches the element description
|
|
284
|
+
3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
|
|
285
|
+
4. Match the label color to the bounding box color
|
|
286
|
+
5. LOCATE that element in the screenshot (NOT a similar-looking element!)
|
|
287
|
+
6. VERIFY position using screen regions:
|
|
247
288
|
- Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
|
|
248
289
|
- Center content: xPercent ~30-70%
|
|
249
290
|
- Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
|
|
250
|
-
|
|
251
|
-
|
|
291
|
+
7. CALCULATE percentages from element's CENTER position
|
|
292
|
+
8. SANITY CHECK your percentages:
|
|
252
293
|
- Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
|
|
253
294
|
- Button in top-left at 90%? WRONG - that's top-right!
|
|
254
295
|
- Element description says "left" but x > 50%? WRONG - recheck!
|
|
@@ -260,7 +301,7 @@ Goal: "Click Settings link in left navigation"
|
|
|
260
301
|
→ Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
|
|
261
302
|
→ Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
|
|
262
303
|
→ xPercent: 12.500, yPercent: 32.000
|
|
263
|
-
→ Sanity check: 12.5% is FAR LEFT
|
|
304
|
+
→ Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
|
|
264
305
|
→ Description: "Clicking center of Settings link in left sidebar"
|
|
265
306
|
|
|
266
307
|
CRITICAL VISUAL ESTIMATION TIPS:
|
|
@@ -329,15 +370,57 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
329
370
|
/**
|
|
330
371
|
* Build user prompt with context
|
|
331
372
|
*/
|
|
332
|
-
static buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
|
|
373
|
+
static buildUserPrompt(context: AgentContext, consecutiveFailures?: number, enableCoordinateMode: boolean = false): string {
|
|
333
374
|
const parts: string[] = [];
|
|
334
375
|
|
|
376
|
+
// Add SoM format reminder if screenshot is present
|
|
377
|
+
if (context.somScreenshot) {
|
|
378
|
+
parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
|
|
379
|
+
parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
|
|
380
|
+
parts.push(`Format: { "elementRef": "5", "action": "fill", "value": "text" }`);
|
|
381
|
+
parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
// Add repair mode context if present
|
|
385
|
+
if (context.priorSteps && context.priorSteps.length > 0) {
|
|
386
|
+
parts.push(`=== REPAIR MODE ===`);
|
|
387
|
+
parts.push(`Fixing a FAILED step in existing script. Page persisted from prior steps.\n`);
|
|
388
|
+
|
|
389
|
+
parts.push(`COMPLETED STEPS (already executed):`);
|
|
390
|
+
for (let i = 0; i < context.priorSteps.length; i++) {
|
|
391
|
+
parts.push(` ${i + 1}. [OK] ${context.priorSteps[i]}`);
|
|
392
|
+
}
|
|
393
|
+
parts.push(``);
|
|
394
|
+
|
|
395
|
+
parts.push(`>>> FAILED STEP ${context.priorSteps.length + 1}: ${context.currentStepGoal}`);
|
|
396
|
+
parts.push(` This step FAILED. Your job: fix it using current UI (SoM markers).\n`);
|
|
397
|
+
|
|
398
|
+
if (context.nextSteps && context.nextSteps.length > 0) {
|
|
399
|
+
parts.push(`REMAINING STEPS (auto-executes after you fix current):`);
|
|
400
|
+
for (let i = 0; i < context.nextSteps.length; i++) {
|
|
401
|
+
parts.push(` ${context.priorSteps.length + 2 + i}. ${context.nextSteps[i]}`);
|
|
402
|
+
}
|
|
403
|
+
parts.push(``);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
parts.push(`REPAIR STRATEGY:`);
|
|
407
|
+
parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
|
|
408
|
+
parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
|
|
409
|
+
parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
|
|
410
|
+
parts.push(`- Use SoM markers to identify current elements`);
|
|
411
|
+
parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
|
|
412
|
+
parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
|
|
413
|
+
parts.push(` → Repair mode = single step fix, then hand back control`);
|
|
414
|
+
parts.push(` → Don't continue to next steps - script will auto-execute them`);
|
|
415
|
+
parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
|
|
416
|
+
}
|
|
417
|
+
|
|
335
418
|
// Put static instructions first for LLM caching efficiency
|
|
336
419
|
parts.push('STEP EXECUTION RULES:');
|
|
337
420
|
parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
|
|
338
421
|
parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
|
|
339
422
|
parts.push('- Mark "complete" ONLY if commands succeeded');
|
|
340
|
-
parts.push('-
|
|
423
|
+
parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
|
|
341
424
|
parts.push('- Max 5 iterations per step, then forced STUCK\n');
|
|
342
425
|
|
|
343
426
|
// Dynamic content follows (changes per iteration)
|
|
@@ -346,38 +429,67 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
346
429
|
// Display note from previous iteration (high priority tactical info)
|
|
347
430
|
if (context.noteFromPreviousIteration) {
|
|
348
431
|
const note = context.noteFromPreviousIteration;
|
|
349
|
-
parts.push(`📝 YOUR NOTE FROM ITERATION
|
|
432
|
+
parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
|
|
350
433
|
parts.push(` ${note.content}`);
|
|
351
434
|
parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
|
|
435
|
+
parts.push(``);
|
|
436
|
+
parts.push(` ACTION REQUIRED:`);
|
|
437
|
+
parts.push(` 1. Did your previous action work? Check the screenshot!`);
|
|
438
|
+
parts.push(` 2. If it WORKED: Execute next step from your plan`);
|
|
439
|
+
parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
|
|
440
|
+
parts.push(` 4. Write NEW noteToFutureSelf with:`);
|
|
441
|
+
parts.push(` - What worked/didn't work (learn from attempts)`);
|
|
442
|
+
parts.push(` - Updated strategy with new backup plan`);
|
|
443
|
+
parts.push(` - Next alternatives to try if this fails`);
|
|
444
|
+
parts.push(` - Build on previous note's reasoning`);
|
|
445
|
+
parts.push(``);
|
|
446
|
+
parts.push(` DON'T repeat failed actions - try your backup plan!`);
|
|
352
447
|
parts.push('');
|
|
353
448
|
}
|
|
354
449
|
|
|
355
|
-
// Check for screenshot loops (analysis paralysis)
|
|
450
|
+
// Check for screenshot loops (analysis paralysis) - PER STEP tracking
|
|
451
|
+
const screenshotsThisStep = context.recentSteps.filter(s =>
|
|
452
|
+
s.stepNumber === context.stepNumber &&
|
|
453
|
+
(s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'))
|
|
454
|
+
);
|
|
356
455
|
const recentScreenshots = context.recentSteps.slice(-3).filter(s =>
|
|
357
456
|
s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')
|
|
358
457
|
);
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
parts.push(`
|
|
362
|
-
parts.push(`
|
|
363
|
-
parts.push(`
|
|
364
|
-
parts.push(`
|
|
365
|
-
parts.push(
|
|
458
|
+
|
|
459
|
+
if (screenshotsThisStep.length >= 3) {
|
|
460
|
+
parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
|
|
461
|
+
parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
|
|
462
|
+
parts.push(`NO MORE SCREENSHOTS ALLOWED - YOU MUST ACT NOW!`);
|
|
463
|
+
parts.push(`Pick ANY selector from your prior DOM snapshots and TRY IT.`);
|
|
464
|
+
parts.push(`Even if uncertain, execute the command. Failure is better than analysis paralysis.`);
|
|
465
|
+
parts.push(`If toolCalls contains "take_screenshot" → SYSTEM WILL REJECT IT\n`);
|
|
466
|
+
} else if (recentScreenshots.length >= 2) {
|
|
467
|
+
parts.push(`[WARNING] ${recentScreenshots.length} screenshots in last 3 iterations - avoid more screenshots`);
|
|
468
|
+
parts.push(`Use selector recommendations from prior screenshots\n`);
|
|
366
469
|
}
|
|
367
470
|
|
|
368
471
|
// System warnings for accumulated failures
|
|
369
472
|
if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
|
|
370
|
-
parts.push(
|
|
371
|
-
|
|
473
|
+
parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
|
|
474
|
+
// Only suggest screenshot if we haven't already taken multiple THIS STEP
|
|
475
|
+
if (screenshotsThisStep.length === 0) {
|
|
476
|
+
parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
|
|
477
|
+
} else {
|
|
478
|
+
parts.push(`You already have visual context. Try different selector NOW.`);
|
|
479
|
+
}
|
|
372
480
|
parts.push(`Question assumptions: Am I at the right step?`);
|
|
373
|
-
parts.push(
|
|
481
|
+
parts.push(`[WARNING]\n`);
|
|
374
482
|
} else if (consecutiveFailures && consecutiveFailures >= 4) {
|
|
375
|
-
parts.push(
|
|
376
|
-
|
|
483
|
+
parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
|
|
484
|
+
if (enableCoordinateMode) {
|
|
485
|
+
parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
|
|
486
|
+
} else {
|
|
487
|
+
parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
|
|
488
|
+
}
|
|
377
489
|
}
|
|
378
490
|
|
|
379
|
-
// Trigger coordinate mode if many failures (Phase 1: after 3 failures)
|
|
380
|
-
if (consecutiveFailures && consecutiveFailures >= 3) {
|
|
491
|
+
// Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
|
|
492
|
+
if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
|
|
381
493
|
parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
|
|
382
494
|
parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
|
|
383
495
|
parts.push(`You MUST use coordinate-based action now (percentages).`);
|
|
@@ -390,6 +502,8 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
390
502
|
parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
|
|
391
503
|
parts.push(`${context.currentStepGoal}`);
|
|
392
504
|
parts.push(``);
|
|
505
|
+
parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
|
|
506
|
+
parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
|
|
393
507
|
parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
|
|
394
508
|
|
|
395
509
|
if (context.completedSteps.length > 0) {
|
|
@@ -399,14 +513,44 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
399
513
|
parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
|
|
400
514
|
}
|
|
401
515
|
|
|
516
|
+
// SoM screenshot (if available)
|
|
517
|
+
if (context.somScreenshot) {
|
|
518
|
+
parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
|
|
519
|
+
parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
|
|
520
|
+
parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
|
|
521
|
+
parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
|
|
522
|
+
parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
|
|
523
|
+
parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
|
|
524
|
+
parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
|
|
525
|
+
parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
|
|
526
|
+
parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
|
|
527
|
+
parts.push(``);
|
|
528
|
+
|
|
529
|
+
// SoM element map for disambiguation
|
|
530
|
+
if (context.somElementMap) {
|
|
531
|
+
parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
|
|
532
|
+
parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
|
|
533
|
+
parts.push(context.somElementMap);
|
|
534
|
+
parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
|
|
535
|
+
parts.push(``);
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
|
|
402
539
|
// Current page state (most variable content - at the end)
|
|
403
540
|
parts.push(`\nCURRENT PAGE:`);
|
|
404
541
|
parts.push(`URL: ${context.currentURL}`);
|
|
405
542
|
parts.push(`Title: ${context.currentPageInfo.title}`);
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
543
|
+
|
|
544
|
+
// Only include DOM details if NOT in SoM mode
|
|
545
|
+
if (!context.somScreenshot) {
|
|
546
|
+
parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
|
|
547
|
+
parts.push(context.currentPageInfo.formattedElements);
|
|
548
|
+
parts.push(`\nARIA TREE (hierarchical structure):`);
|
|
549
|
+
parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
|
|
550
|
+
} else {
|
|
551
|
+
// In SoM mode, skip DOM details - agent uses visual screenshot instead
|
|
552
|
+
parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
|
|
553
|
+
}
|
|
410
554
|
if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
|
|
411
555
|
parts.push('... (truncated)');
|
|
412
556
|
}
|
|
@@ -416,11 +560,11 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
416
560
|
if (context.recentSteps.length > 0) {
|
|
417
561
|
parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
|
|
418
562
|
for (const step of context.recentSteps) {
|
|
419
|
-
const status = step.result === 'success' ? '
|
|
563
|
+
const status = step.result === 'success' ? '[OK]' : '[FAIL]';
|
|
420
564
|
parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
|
|
421
565
|
parts.push(` Code: ${step.code}`);
|
|
422
566
|
if (step.result === 'failure' && step.error) {
|
|
423
|
-
parts.push(`
|
|
567
|
+
parts.push(` ERROR: ${step.error}`);
|
|
424
568
|
parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
|
|
425
569
|
} else {
|
|
426
570
|
parts.push(` Result: ${step.observation}`);
|
|
@@ -435,7 +579,7 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
435
579
|
i === 0 || s.code === arr[i-1].code
|
|
436
580
|
);
|
|
437
581
|
if (sameSelector) {
|
|
438
|
-
parts.push(
|
|
582
|
+
parts.push(`[WARNING] WARNING: You've tried the same selector multiple times and it failed!`);
|
|
439
583
|
parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
|
|
440
584
|
parts.push(` YOU MUST try a completely different selector this time!\n`);
|
|
441
585
|
}
|
|
@@ -458,16 +602,184 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
458
602
|
parts.push('');
|
|
459
603
|
}
|
|
460
604
|
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
605
|
+
return parts.join('\n');
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
/**
|
|
609
|
+
* Build exploratory system prompt for autonomous exploration
|
|
610
|
+
*/
|
|
611
|
+
static buildExploratorySystemPrompt(toolDescriptions: string): string {
|
|
612
|
+
return `You are an autonomous exploration agent that discovers and tests web application features.
|
|
613
|
+
|
|
614
|
+
${toolDescriptions}
|
|
615
|
+
|
|
616
|
+
YOUR RESPONSE FORMAT - Output JSON matching this interface:
|
|
617
|
+
|
|
618
|
+
interface AgentDecisionLLMResponse {
|
|
619
|
+
status: string; // "continue" | "complete" | "stuck"
|
|
620
|
+
reasoning: string; // What you're exploring and why
|
|
621
|
+
|
|
622
|
+
// COMMANDS: Array of plain Playwright command strings
|
|
623
|
+
commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
|
|
624
|
+
commandReasoning?: string;
|
|
625
|
+
toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
|
|
626
|
+
name: string;
|
|
627
|
+
params: Record<string, any>;
|
|
628
|
+
}>;
|
|
629
|
+
toolReasoning?: string;
|
|
630
|
+
needsToolResults?: boolean;
|
|
631
|
+
noteToFutureSelf?: string;
|
|
632
|
+
coordinateAction?: { ... };
|
|
633
|
+
experiences?: string[]; // Use for BOTH app patterns AND exploration progress
|
|
634
|
+
blockerDetected?: { ... };
|
|
635
|
+
debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
|
|
636
|
+
suggestedPromptUpdates?: string;
|
|
637
|
+
reasoning?: string;
|
|
638
|
+
};
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
EXPLORATION MODE GUIDELINES:
|
|
642
|
+
|
|
643
|
+
1. **JOURNEY-FOCUSED EXPLORATION**: Follow the exploration prompt as your goal for THIS journey
|
|
644
|
+
- Example prompt: "Explore Dashboard and test all widgets"
|
|
645
|
+
- You should systematically test dashboard widgets, not wander off to other sections
|
|
646
|
+
- Stay focused on the given journey goal
|
|
647
|
+
|
|
648
|
+
2. **ICON BUTTONS**: Match step goal to icon semantics (plus=add, gear=settings, hamburger=menu). Check element map for confirmation. Don't randomly try - reason about fit.
|
|
649
|
+
|
|
650
|
+
3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).
|
|
651
|
+
|
|
652
|
+
4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.
|
|
653
|
+
|
|
654
|
+
5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.
|
|
655
|
+
|
|
656
|
+
6. **LIMITATIONS**: Cannot complete: sign-up, forgot password, OTP, CAPTCHA, email verification (no inbox/SMS access).
|
|
657
|
+
If encountered: CAPTCHA → stuck, sign-up/OTP → skip and explore other areas.
|
|
658
|
+
|
|
659
|
+
7. **AUTH**: If credentials provided, login FIRST using exact testDataPrompt values. Don't explore public pages or click sign-up.
|
|
660
|
+
|
|
661
|
+
8. **BLOCKERS**: Clear cookie modals, tour popups autonomously with blockerDetected.clearingCommands. CAPTCHA → stuck.
|
|
662
|
+
|
|
663
|
+
9. **STATUS**: complete=goal achieved or budget low, continue=need more, stuck=cannot proceed. Complete when journey goal met, don't wait for maxSteps.
|
|
664
|
+
|
|
665
|
+
10. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.
|
|
666
|
+
|
|
667
|
+
CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
|
|
668
|
+
YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
/**
|
|
672
|
+
* Build exploratory user prompt with context
|
|
673
|
+
*/
|
|
674
|
+
static buildExploratoryUserPrompt(
|
|
675
|
+
context: AgentContext,
|
|
676
|
+
explorationPrompt: string,
|
|
677
|
+
testDataPrompt?: string,
|
|
678
|
+
stepNumber?: number,
|
|
679
|
+
maxSteps?: number
|
|
680
|
+
): string {
|
|
681
|
+
const parts: string[] = [];
|
|
682
|
+
|
|
683
|
+
// Add SoM format reminder if screenshot is present
|
|
684
|
+
if (context.somScreenshot) {
|
|
685
|
+
parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
|
|
686
|
+
parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
|
|
687
|
+
parts.push(`Format: { "elementRef": "1", "action": "click" }`);
|
|
688
|
+
parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
parts.push('=== JOURNEY EXPLORATION CONTEXT ===\n');
|
|
692
|
+
parts.push(`GOAL: ${explorationPrompt}`);
|
|
693
|
+
parts.push(` (Focus on THIS specific goal - don't wander to unrelated areas)\n`);
|
|
694
|
+
|
|
695
|
+
if (testDataPrompt) {
|
|
696
|
+
parts.push(`TEST DATA/CREDENTIALS: ${testDataPrompt}`);
|
|
697
|
+
parts.push(` [WARNING] IMPORTANT: If credentials are provided above (email/username and password), you MUST:`);
|
|
698
|
+
parts.push(` - Use them to LOGIN and explore authenticated features`);
|
|
699
|
+
parts.push(` - Fill login forms with the exact credentials provided`);
|
|
700
|
+
parts.push(` - Don't waste time on public/unauthenticated pages when you can login`);
|
|
701
|
+
parts.push(` - Prioritize exploring the authenticated app experience\n`);
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
if (stepNumber && maxSteps) {
|
|
705
|
+
parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
// Show discovered and tracked data from extractedData
|
|
709
|
+
if (context.extractedData && Object.keys(context.extractedData).length > 0) {
|
|
710
|
+
parts.push(`\nDISCOVERED DATA (this journey):`);
|
|
711
|
+
for (const [key, value] of Object.entries(context.extractedData)) {
|
|
712
|
+
parts.push(` ${key}: ${value}`);
|
|
713
|
+
}
|
|
714
|
+
}
|
|
715
|
+
|
|
716
|
+
// SoM screenshot (if available)
|
|
717
|
+
if (context.somScreenshot) {
|
|
718
|
+
parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
|
|
719
|
+
parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
|
|
720
|
+
parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
|
|
721
|
+
parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
|
|
722
|
+
parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
|
|
723
|
+
parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
|
|
724
|
+
parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
|
|
725
|
+
parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
|
|
726
|
+
parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
|
|
727
|
+
parts.push(``);
|
|
728
|
+
|
|
729
|
+
// SoM element map for disambiguation
|
|
730
|
+
if (context.somElementMap) {
|
|
731
|
+
parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
|
|
732
|
+
parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
|
|
733
|
+
parts.push(context.somElementMap);
|
|
734
|
+
parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
|
|
735
|
+
parts.push(``);
|
|
467
736
|
}
|
|
468
|
-
parts.push('');
|
|
469
737
|
}
|
|
470
738
|
|
|
739
|
+
parts.push(`\nCURRENT PAGE:`);
|
|
740
|
+
parts.push(`URL: ${context.currentURL}`);
|
|
741
|
+
parts.push(`Title: ${context.currentPageInfo.title}`);
|
|
742
|
+
|
|
743
|
+
// Only include DOM details if NOT in SoM mode
|
|
744
|
+
if (!context.somScreenshot) {
|
|
745
|
+
parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
|
|
746
|
+
parts.push(context.currentPageInfo.formattedElements);
|
|
747
|
+
parts.push(`\nARIA TREE (hierarchical structure):`);
|
|
748
|
+
parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
|
|
749
|
+
} else {
|
|
750
|
+
// In SoM mode, skip DOM details - agent uses visual screenshot
|
|
751
|
+
parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
|
|
752
|
+
}
|
|
753
|
+
if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
|
|
754
|
+
parts.push('... (truncated)');
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
// Recent actions
|
|
758
|
+
if (context.recentSteps.length > 0) {
|
|
759
|
+
parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
|
|
760
|
+
for (const step of context.recentSteps) {
|
|
761
|
+
const status = step.result === 'success' ? '[OK]' : '[FAIL]';
|
|
762
|
+
parts.push(` ${status} ${step.action}`);
|
|
763
|
+
parts.push(` ${step.observation}`);
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
// Learnings and exploration progress
|
|
768
|
+
if (context.experiences && context.experiences.length > 0) {
|
|
769
|
+
parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
|
|
770
|
+
for (const exp of context.experiences) {
|
|
771
|
+
parts.push(` • ${exp}`);
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
// Note from previous iteration
|
|
776
|
+
if (context.noteFromPreviousIteration) {
|
|
777
|
+
parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
|
|
778
|
+
parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);
|
|
782
|
+
|
|
471
783
|
return parts.join('\n');
|
|
472
784
|
}
|
|
473
785
|
}
|