testchimp-runner-core 0.0.34 → 0.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/dist/execution-service.d.ts +1 -4
  2. package/dist/execution-service.d.ts.map +1 -1
  3. package/dist/execution-service.js +155 -468
  4. package/dist/execution-service.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +11 -1
  8. package/dist/index.js.map +1 -1
  9. package/dist/orchestrator/decision-parser.d.ts +18 -0
  10. package/dist/orchestrator/decision-parser.d.ts.map +1 -0
  11. package/dist/orchestrator/decision-parser.js +127 -0
  12. package/dist/orchestrator/decision-parser.js.map +1 -0
  13. package/dist/orchestrator/index.d.ts +4 -2
  14. package/dist/orchestrator/index.d.ts.map +1 -1
  15. package/dist/orchestrator/index.js +14 -2
  16. package/dist/orchestrator/index.js.map +1 -1
  17. package/dist/orchestrator/orchestrator-agent.d.ts +17 -14
  18. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  19. package/dist/orchestrator/orchestrator-agent.js +534 -204
  20. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  21. package/dist/orchestrator/orchestrator-prompts.d.ts +14 -2
  22. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
  23. package/dist/orchestrator/orchestrator-prompts.js +529 -247
  24. package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
  25. package/dist/orchestrator/page-som-handler.d.ts +106 -0
  26. package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
  27. package/dist/orchestrator/page-som-handler.js +1353 -0
  28. package/dist/orchestrator/page-som-handler.js.map +1 -0
  29. package/dist/orchestrator/som-types.d.ts +149 -0
  30. package/dist/orchestrator/som-types.d.ts.map +1 -0
  31. package/dist/orchestrator/som-types.js +87 -0
  32. package/dist/orchestrator/som-types.js.map +1 -0
  33. package/dist/orchestrator/tool-registry.d.ts +2 -0
  34. package/dist/orchestrator/tool-registry.d.ts.map +1 -1
  35. package/dist/orchestrator/tool-registry.js.map +1 -1
  36. package/dist/orchestrator/tools/index.d.ts +4 -1
  37. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  38. package/dist/orchestrator/tools/index.js +7 -2
  39. package/dist/orchestrator/tools/index.js.map +1 -1
  40. package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
  41. package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
  42. package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
  43. package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
  44. package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
  45. package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
  46. package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
  47. package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
  48. package/dist/orchestrator/types.d.ts +23 -1
  49. package/dist/orchestrator/types.d.ts.map +1 -1
  50. package/dist/orchestrator/types.js +11 -1
  51. package/dist/orchestrator/types.js.map +1 -1
  52. package/dist/scenario-service.d.ts +5 -0
  53. package/dist/scenario-service.d.ts.map +1 -1
  54. package/dist/scenario-service.js +17 -0
  55. package/dist/scenario-service.js.map +1 -1
  56. package/dist/scenario-worker-class.d.ts +4 -0
  57. package/dist/scenario-worker-class.d.ts.map +1 -1
  58. package/dist/scenario-worker-class.js +18 -3
  59. package/dist/scenario-worker-class.js.map +1 -1
  60. package/dist/testing/agent-tester.d.ts +35 -0
  61. package/dist/testing/agent-tester.d.ts.map +1 -0
  62. package/dist/testing/agent-tester.js +84 -0
  63. package/dist/testing/agent-tester.js.map +1 -0
  64. package/dist/testing/ref-translator-tester.d.ts +44 -0
  65. package/dist/testing/ref-translator-tester.d.ts.map +1 -0
  66. package/dist/testing/ref-translator-tester.js +104 -0
  67. package/dist/testing/ref-translator-tester.js.map +1 -0
  68. package/dist/utils/hierarchical-selector.d.ts +47 -0
  69. package/dist/utils/hierarchical-selector.d.ts.map +1 -0
  70. package/dist/utils/hierarchical-selector.js +212 -0
  71. package/dist/utils/hierarchical-selector.js.map +1 -0
  72. package/dist/utils/page-info-retry.d.ts +14 -0
  73. package/dist/utils/page-info-retry.d.ts.map +1 -0
  74. package/dist/utils/page-info-retry.js +60 -0
  75. package/dist/utils/page-info-retry.js.map +1 -0
  76. package/dist/utils/page-info-utils.d.ts +1 -0
  77. package/dist/utils/page-info-utils.d.ts.map +1 -1
  78. package/dist/utils/page-info-utils.js +46 -18
  79. package/dist/utils/page-info-utils.js.map +1 -1
  80. package/dist/utils/ref-attacher.d.ts +21 -0
  81. package/dist/utils/ref-attacher.d.ts.map +1 -0
  82. package/dist/utils/ref-attacher.js +149 -0
  83. package/dist/utils/ref-attacher.js.map +1 -0
  84. package/dist/utils/ref-translator.d.ts +49 -0
  85. package/dist/utils/ref-translator.d.ts.map +1 -0
  86. package/dist/utils/ref-translator.js +276 -0
  87. package/dist/utils/ref-translator.js.map +1 -0
  88. package/package.json +1 -1
  89. package/plandocs/exploratory-mode-support-v2.plan.md +953 -0
  90. package/plandocs/exploratory-mode-support.plan.md +928 -0
  91. package/plandocs/journey-id-tracking-addendum.md +227 -0
  92. package/src/execution-service.ts +179 -596
  93. package/src/index.ts +10 -0
  94. package/src/orchestrator/decision-parser.ts +139 -0
  95. package/src/orchestrator/index.ts +25 -1
  96. package/src/orchestrator/orchestrator-agent.ts +656 -236
  97. package/src/orchestrator/orchestrator-prompts.ts +559 -247
  98. package/src/orchestrator/page-som-handler.ts +1565 -0
  99. package/src/orchestrator/som-types.ts +188 -0
  100. package/src/orchestrator/tool-registry.ts +2 -0
  101. package/src/orchestrator/tools/index.ts +4 -1
  102. package/src/orchestrator/tools/refresh-som-markers.ts +69 -0
  103. package/src/orchestrator/tools/view-previous-screenshot.ts +103 -0
  104. package/src/orchestrator/types.ts +49 -6
  105. package/src/scenario-service.ts +20 -0
  106. package/src/scenario-worker-class.ts +24 -3
  107. package/src/utils/page-info-retry.ts +65 -0
  108. package/src/utils/page-info-utils.ts +53 -18
  109. package/testchimp-runner-core-0.0.35.tgz +0 -0
  110. package/src/orchestrator/orchestrator-agent.ts.backup +0 -1386
  111. package/testchimp-runner-core-0.0.33.tgz +0 -0
  112. /package/{RELEASE_0.0.26.md → releasenotes/RELEASE_0.0.26.md} +0 -0
  113. /package/{RELEASE_0.0.27.md → releasenotes/RELEASE_0.0.27.md} +0 -0
  114. /package/{RELEASE_0.0.28.md → releasenotes/RELEASE_0.0.28.md} +0 -0
@@ -10,223 +10,261 @@ export class OrchestratorPrompts {
10
10
  /**
11
11
  * Build main system prompt for selector-based mode
12
12
  */
13
- static buildSystemPrompt(toolDescriptions: string): string {
13
+ static buildSystemPrompt(toolDescriptions: string, enableCoordinateMode: boolean = false): string {
14
14
  return `You are an intelligent test automation agent that executes web scenarios using Playwright.
15
15
 
16
+ DISCRETE EXPERIENCE LOOP:
17
+ You operate in iterations: receive state → decide → sleep → wake with new state.
18
+
19
+ Key implications:
20
+ - System waits for page stability after each batch
21
+ - Effects may be transient (alerts) or persistent (error labels) - suggest checking persistent indicators
22
+ - Batch safe commands (fill all fields together), separate DOM-changing ones (click then fill)
23
+ - Note to future self: strategy, what to verify, backup plans if action fails
24
+
25
+ EXECUTION PRIORITIES:
26
+ 1. SoM-marked elements (reliable selectors)
27
+ 2. Keyboard nav from marked elements (Tab, Enter)
28
+ 3. Coords for unmarked elements (valid fallback)
29
+
30
+ Difficult: Shadow DOM, iframes, canvas UIs, file uploads - try keyboard or coords.
31
+
16
32
  ${toolDescriptions}
17
33
 
18
- YOUR RESPONSE FORMAT - Output JSON matching this interface:
34
+ CRITICAL: STEP COMPLETION DECISION
35
+ Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
36
+
37
+ **When to mark COMPLETE:**
38
+ - Step: "Navigate to URL" → Mark complete after navigate command succeeds (don't login yet!)
39
+ - Step: "Fill login form" → Mark complete after filling fields (don't click submit yet!)
40
+ - Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
41
+
42
+ **DO NOT:**
43
+ - Continue with future steps while still on current step
44
+ - Assume the step wants you to do more than stated
45
+ - Wait for side effects (navigation, etc.) before marking complete
46
+
47
+ **The goal text is LITERAL** - do exactly what it says, then mark complete.
48
+
49
+ OUTPUT FORMAT (JSON):
50
+
51
+ {
52
+ "status": "continue" | "complete" | "stuck" | "infeasible",
53
+ "reasoning": "your thinking",
54
+
55
+ "commands": [ // Mix ref and playwright commands as needed
56
+ { "type": "playwright", "code": "await page.goto('https://example.com')" },
57
+ { "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
58
+ { "type": "ref", "ref": "e31", "operation": "click" },
59
+ { "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
60
+ ],
61
+
62
+ "toolCalls": [{ "name": "tool_name", "params": {} }],
63
+ "blockerDetected": { "description": "...", "clearingCommands": ["..."] },
64
+ "experiences": ["app pattern"],
65
+ "noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
66
+ "debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
67
+ "suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
68
+ "reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
69
+ }
70
+ }
71
+
72
+ NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
73
+ Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
74
+ Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
75
+
76
+ META-LEARNING (debugInfo): Could this prompt have been better? If so, suggest fixes.
77
+
78
+ STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
79
+ RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
80
+
81
+ COMMANDS FORMAT:
82
+
83
+ Array of plain Playwright command strings:
84
+ {
85
+ "commands": [
86
+ "await page.fill('input[name=\"email\"]', 'user@test.com')",
87
+ "await page.fill('input[name=\"password\"]', 'secret123')",
88
+ "await page.click('button[type=\"submit\"]')"
89
+ ]
90
+ }
91
+
92
+ SELECTOR STRATEGIES (use in order of preference):
93
+ 1. getByRole: page.getByRole('button', {name: 'Login'})
94
+ 2. getByLabel: page.getByLabel('Email address')
95
+ 3. getByPlaceholder: page.getByPlaceholder('Enter email')
96
+ 4. getByText: page.getByText('Sign in')
97
+ 5. CSS: page.locator('input[name="email"]')
98
+ 6. Test IDs: page.getByTestId('login-button')
99
+
100
+ Example login commands:
101
+ {
102
+ "commands": [
103
+ "await page.getByLabel('Email').fill('user@test.com')",
104
+ "await page.getByLabel('Password').fill('secret123')",
105
+ "await page.getByRole('button', {name: 'Submit'}).click()"
106
+ ]
107
+ }
108
+
109
+ INTERACTIVE ELEMENTS section shows available selectors for each element.`;
110
+ }
111
+
112
+ /**
113
+ * Build SoM (Set-of-Marks) system prompt for visual element identification
114
+ */
115
+ static buildSomSystemPrompt(restrictCoordinates: boolean = false): string {
116
+ const coordinateRestriction = restrictCoordinates ? `
117
+
118
+ CRITICAL: COORDINATE COMMANDS RESTRICTED
119
+ You are in SCRIPT GENERATION mode. Coordinate-based commands should ONLY be used as an ABSOLUTE LAST RESORT.
120
+
121
+ Strong preference order:
122
+ 1. Use SoM-marked elements with actions (fill, click, press Enter)
123
+ 2. Use keyboard navigation from SoM-marked elements (Tab, Arrow keys, Enter to submit)
124
+ 3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
125
+
126
+ If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
127
+
128
+ return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
129
+
130
+ DISCRETE EXPERIENCE LOOP:
131
+ You operate in iterations: receive state → decide → sleep → wake with new state.
132
+ System waits for page stability after each batch - you ALWAYS receive fully loaded pages (never loading screens).
133
+ Batch safe commands, suggest persistent indicators, tell future self what to verify.
134
+
135
+ IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
136
+
137
+ SCREENSHOT SCOPE:
138
+ - Shows VIEWPORT ONLY (what's currently visible, not full page)
139
+ - Elements outside the viewport are NOT shown (you must scroll to reveal them)
140
+ - If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
141
+
142
+ VISUAL MARKER SYSTEM:
143
+ - Each interactive element has a colored bounding box with a unique color
144
+ - The element ID (1, 2, 3, etc.) is displayed in a label at the TOP-RIGHT corner, OUTSIDE the box
145
+ - The label is typically positioned OUTSIDE and ABOVE the bounding box (not attached)
146
+ - The label color matches the bounding box color for easy correlation
147
+ - TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color
148
+
149
+ REFERENCE ELEMENTS BY ID:
150
+ - To interact with an element, reference its ID in your commands
151
+
152
+ ICON BUTTON IDENTIFICATION:
153
+ When step involves icon buttons (no visible text), use COMMON ICON SEMANTICS + element map:
154
+ - Match step goal to icon meaning: "Add Campaign" → plus icon, "Settings" → gear, "Menu" → hamburger, "Delete" → trash
155
+ - Check element map for aria-label confirmation: [5] might show (aria: "add-campaign")
156
+ - Common icons: hamburger=menu, gear=settings, plus=add, trash=delete, arrow=back/nav, check=confirm, X=close, magnifier=search, dots=more
157
+ - DON'T randomly try icon buttons - reason about which icon fits the step goal
158
+
159
+ Example: Step "Add new campaign" → Look for plus icon in toolbar → Check map shows aria "add" → Use that ID.
160
+
161
+ CRITICAL: ONLY INTERACT WITH VISIBLE ELEMENTS - use your EYES, not assumptions!
162
+ FORBIDDEN: Guessing locations, assuming "typical" positions, clicking without seeing element.
163
+ REQUIRED: Only interact with elements you SEE in screenshot. If not visible, scroll or use take_screenshot(isFullPage=true).
164
+ If action fails, try alternative elements - don't repeat same ID blindly.
165
+
166
+ TYPESCRIPT INTERFACES (your response MUST conform to these):
167
+
168
+ \`\`\`typescript
169
+ interface Coordinate {
170
+ x: number; // Percentage of viewport width (0-100, use 3 decimals: 15.625)
171
+ y: number; // Percentage of viewport height (0-100, use 3 decimals: 82.375)
172
+ }
173
+
174
+ interface SomCommand {
175
+ action: InteractionAction; // REQUIRED: Action to perform (distinguishes from SomVerification)
176
+ elementRef?: string; // Element ID from screenshot (e.g., "1", "2", "42")
177
+ coord?: Coordinate; // Direct percentage-based coords (use when SoM marker missing)
178
+ value?: string; // For fill/select/press actions
179
+ fromCoord?: Coordinate; // For drag actions (start point)
180
+ toCoord?: Coordinate; // For drag actions (end point)
181
+ // ... other optional parameters
182
+ }
183
+
184
+ interface SomVerification {
185
+ verificationType: VerificationType; // REQUIRED: Type of verification (distinguishes from SomCommand)
186
+ elementRef?: string; // SoM ID (e.g., "3") - optional for count verifications
187
+ expected?: string | number; // Expected value/text/count
188
+ description?: string; // Human-readable description
189
+ selector?: string; // CSS selector for count verifications (e.g., 'ul.items > li')
190
+ }
191
+
192
+ // See available actions and verifications in the comments below
193
+
194
+ COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationType').
195
+ Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
196
+ CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
197
+
198
+ COORDINATES (when SoM marker missing):
199
+ Use percentage-based coords for unmarked elements:
200
+ { "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
201
+
202
+ Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
203
+ After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
204
+
205
+ NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
206
+ Example: { "action": "navigate", "value": "https://..." }
207
+ DON'T click address bar - use navigate action. System waits for page load after navigation.
208
+
209
+ // Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
210
+ // Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
19
211
 
20
212
  interface AgentDecisionLLMResponse {
21
- status: string; // REQUIRED: "continue" | "complete" | "stuck" | "infeasible"
22
- reasoning: string; // REQUIRED: Your thinking - what you're doing and why
23
- commands?: string[]; // Playwright commands to execute
24
- commandReasoning?: string; // Why these commands
25
- toolCalls?: Array<{ // Tools to call
26
- name: string;
27
- params: Record<string, any>;
28
- }>;
29
- toolReasoning?: string; // Why these tools
30
- needsToolResults?: boolean; // Wait for tool results before commands
31
- noteToFutureSelf?: string; // Free-form tactical note for next iteration
32
- coordinateAction?: { // Use when selectors fail (after 3 attempts)
33
- type: "coordinate";
34
- action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
35
- xPercent: number; // 0-100, 3 decimals (e.g., 15.755)
36
- yPercent: number; // 0-100, 3 decimals (e.g., 8.500)
37
- toXPercent?: number; // For drag
38
- toYPercent?: number; // For drag
39
- value?: string; // For fill
40
- scrollAmount?: number; // For scroll
41
- };
42
- selfReflection?: {
43
- guidanceForNext: string;
44
- detectingLoop: boolean;
45
- loopReasoning?: string;
46
- };
47
- experiences?: string[]; // App-specific learnings
48
- blockerDetected?: {
49
- description: string;
50
- clearingCommands: string[];
51
- };
52
- stepReEvaluation?: {
53
- detected: boolean;
54
- issue: "prior_incomplete" | "already_done" | "wrong_order" | null;
55
- explanation: string;
56
- };
213
+ status: "continue" | "complete" | "stuck" | "infeasible";
214
+ reasoning: string;
215
+ commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
216
+ commandReasoning?: string;
217
+ toolCalls?: Array<{ name: string; params: Record<string, any> }>;
218
+ noteToFutureSelf?: string;
219
+ experiences?: string[];
220
+ blockerDetected?: { description: string; clearingCommands: SomCommand[] };
221
+ debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
222
+ }
223
+ \`\`\`
224
+
225
+ NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
226
+ Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
227
+
228
+ EXAMPLE RESPONSES:
229
+
230
+ Action step:
231
+ \`\`\`json
232
+ {
233
+ "status": "continue",
234
+ "reasoning": "Need to fill login form with credentials",
235
+ "commands": [
236
+ { "elementRef": "5", "action": "fill", "value": "user@example.com" },
237
+ { "elementRef": "7", "action": "fill", "value": "password123" },
238
+ { "elementRef": "12", "action": "click" }
239
+ ],
240
+ "commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
57
241
  }
242
+ \`\`\`
243
+
244
+ Verification step:
245
+ \`\`\`json
246
+ {
247
+ "status": "complete",
248
+ "reasoning": "Message sent and verified in conversation",
249
+ "commands": [
250
+ { "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
251
+ { "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
252
+ ],
253
+ "commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
254
+ }
255
+ \`\`\`
256
+
257
+ REPAIR MODE - Step already completed (DELETE case):
258
+ \`\`\`json
259
+ {
260
+ "status": "complete",
261
+ "reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
262
+ "commands": [],
263
+ "commandReasoning": "No commands needed - step goal already achieved/obsolete"
264
+ }
265
+ \`\`\`
58
266
 
59
- STATUS DECISION RULES (CRITICAL - Think carefully!):
60
-
61
- RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
62
- - Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
63
- - EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
64
- - System will OVERRIDE and force "continue" if you violate this
65
-
66
- RULE #2: Decision tree:
67
- - Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
68
- - Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
69
-
70
- Status meanings:
71
- - "complete": Commands succeeded AND goal achieved
72
- - "continue": Command failed OR need more actions
73
- - "stuck": Tried 5 iterations, all failed, can't proceed
74
- - "infeasible": Goal impossible (element truly doesn't exist)
75
-
76
- Examples:
77
- ✅ Command: page.click('button') → Success → Goal done → status: "complete"
78
- ❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
79
- ✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
80
-
81
- STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
82
-
83
- After repeated failures, check:
84
- 1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
85
- 2. LOOK FORWARD: Is current step already done?
86
- 3. LOOK AHEAD: Is next step more feasible with current page state?
87
-
88
- Stick to original plan unless clear evidence suggests otherwise.
89
-
90
- BLOCKER DETECTION:
91
-
92
- Use when unexpected UI blocks current goal (modals, overlays, prompts).
93
- Provide clearingCommands to dismiss blocker, then regular commands execute.
94
-
95
- Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
96
- NOT blockers: Wrong selectors, missing elements (those are "infeasible")
97
-
98
- EXPERIENCES - App-specific patterns only:
99
- - Concise, 1-2 per iteration
100
- - Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
101
- - NOT obvious things ("button has role=button") or individual selectors
102
- - Combine related learnings
103
-
104
- CRITICAL RULES FOR DECISION MAKING:
105
-
106
- DECISION-MAKING PROCESS:
107
-
108
- 1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
109
-
110
- 2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
111
-
112
- 3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
113
- Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
114
-
115
- 4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
116
-
117
- 5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
118
-
119
- 6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
120
- - You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
121
- - ONLY use what's in the list - DON'T invent text/names not shown
122
- - Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
123
- - ⚠️ If text appears multiple times → scope to parent or use role filter
124
- - Element not in list? → take_screenshot
125
- - ARIA tree = source of truth for roles/names
126
-
127
- ⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
128
- - Tooltips (role="tooltip") - informational only, NOT clickable
129
- - Status messages (role="status", role="alert") - display info, NOT clickable
130
- - Popovers with no buttons inside - usually just show info on hover
131
- - Hidden elements (aria-hidden="true", display:none) - can't interact
132
-
133
- If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
134
-
135
- ⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
136
- - Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
137
- - BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
138
- - GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
139
- - GOOD: locator('button').getByText('Settings') → scoped to button element
140
- - Always prefer role-based selectors when text might appear in tooltips
141
-
142
- 7. **SELECTOR PREFERENCE** (CRITICAL):
143
- Prefer in order:
144
- 1. getByRole/Label/Placeholder - Semantic, stable
145
- 2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
146
- 3. data-testid or stable IDs
147
- 4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
148
-
149
- ⚠️ Common mistakes:
150
- - getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
151
- Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
152
- - Missing timeout on goto() → Add { timeout: 30000 }
153
- - Using auto-generated IDs → Break when components re-render
154
-
155
- 8. **ASSERTIONS** (CRITICAL):
156
- Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
157
-
158
- When to use:
159
- - "Verify X appears" → await expect(locator).toBeVisible()
160
- - "Check field is empty" → await expect(locator).toBeEmpty()
161
- - "Confirm URL" → await expect(page).toHaveURL('...')
162
-
163
- When NOT to use:
164
- - "Send message" → DON'T verify it appeared (unless step asks)
165
- - "Click Submit" → DON'T check button state
166
- - "Fill field" → DON'T verify it's filled
167
-
168
- 9. **TOOLS vs COMMANDS**:
169
- Tools = read-only info gathering (screenshot, recall_history, extract_data)
170
- Commands = state changes (Playwright: goto, click, fill, etc.)
171
-
172
- Navigation commands MUST include timeout:
173
- - page.goto(url, { waitUntil: 'load', timeout: 30000 })
174
- - page.waitForLoadState('load', { timeout: 30000 })
175
-
176
- 10. **ERROR ANALYSIS** (Think about what went wrong):
177
- - "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
178
- - "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
179
- - "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
180
- - "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
181
- - Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
182
-
183
- 11. **WHEN TO RUN COMMANDS vs TOOLS**:
184
- - Confident about selectors from DOM → Run commands directly
185
- - Unsure or failed 2+ times → Take screenshot first
186
- - First iteration of a step → Usually can run commands from DOM
187
- - After successful command → mark "complete" if goal achieved (trust Playwright - if it succeeded, it worked)
188
-
189
- 12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
190
-
191
- Write FREE-FORM notes for your next iteration about:
192
- - What you tried and why it failed
193
- - Hypothesis being tested
194
- - Plan for next attempt
195
- - Page behavior patterns observed
196
-
197
- Your next iteration reads this FIRST - use it to maintain strategic continuity.
198
-
199
- 13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
200
-
201
- Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
202
- - xPercent: 0=left, 100=right
203
- - yPercent: 0=top, 100=bottom
204
-
205
- Format:
206
- {
207
- "coordinateAction": {
208
- "type": "coordinate",
209
- "action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
210
- "xPercent": 15.755, "yPercent": 8.500,
211
- "toXPercent": 45.25, "toYPercent": 8.50, // For drag
212
- "value": "text", // For fill
213
- "scrollAmount": 500 // For scroll
214
- }
215
- }
216
-
217
- AFTER coordinate action succeeds:
218
- - If goal verification unclear → CALL verify_action_result tool
219
- - Tool compares before/after screenshots to confirm goal achieved
220
- - If verified: mark status="complete"
221
- - If not verified: try different coordinates (2 attempts max)
222
-
223
- Example after coordinate click:
224
- {
225
- "status": "continue",
226
- "reasoning": "Coordinate click succeeded, verifying if dashboard page loaded",
227
- "toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page with data grid visible"}}],
228
- "needsToolResults": true
229
- }`;
267
+ OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
230
268
  }
231
269
 
232
270
  /**
@@ -238,17 +276,20 @@ DECISION-MAKING PROCESS:
238
276
  YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
239
277
 
240
278
  SCREENSHOT PROVIDED:
241
- You will see a screenshot with visual indicators (bounding boxes or markers).
279
+ You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
242
280
 
243
281
  CRITICAL - IDENTIFY THE CORRECT ELEMENT:
244
282
  1. READ the step goal carefully - what specific element are you looking for?
245
- 2. LOCATE that element in the screenshot (NOT a similar-looking element!)
246
- 3. VERIFY position using screen regions:
283
+ 2. Look for the colored bounding box that matches the element description
284
+ 3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
285
+ 4. Match the label color to the bounding box color
286
+ 5. LOCATE that element in the screenshot (NOT a similar-looking element!)
287
+ 6. VERIFY position using screen regions:
247
288
  - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
248
289
  - Center content: xPercent ~30-70%
249
290
  - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
250
- 4. CALCULATE percentages from element's CENTER position
251
- 5. SANITY CHECK your percentages:
291
+ 7. CALCULATE percentages from element's CENTER position
292
+ 8. SANITY CHECK your percentages:
252
293
  - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
253
294
  - Button in top-left at 90%? WRONG - that's top-right!
254
295
  - Element description says "left" but x > 50%? WRONG - recheck!
@@ -260,7 +301,7 @@ Goal: "Click Settings link in left navigation"
260
301
  → Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
261
302
  → Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
262
303
  → xPercent: 12.500, yPercent: 32.000
263
- → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
304
+ → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
264
305
  → Description: "Clicking center of Settings link in left sidebar"
265
306
 
266
307
  CRITICAL VISUAL ESTIMATION TIPS:
@@ -329,15 +370,57 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
329
370
  /**
330
371
  * Build user prompt with context
331
372
  */
332
- static buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
373
+ static buildUserPrompt(context: AgentContext, consecutiveFailures?: number, enableCoordinateMode: boolean = false): string {
333
374
  const parts: string[] = [];
334
375
 
376
+ // Add SoM format reminder if screenshot is present
377
+ if (context.somScreenshot) {
378
+ parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
379
+ parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
380
+ parts.push(`Format: { "elementRef": "5", "action": "fill", "value": "text" }`);
381
+ parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
382
+ }
383
+
384
+ // Add repair mode context if present
385
+ if (context.priorSteps && context.priorSteps.length > 0) {
386
+ parts.push(`=== REPAIR MODE ===`);
387
+ parts.push(`Fixing a FAILED step in existing script. Page persisted from prior steps.\n`);
388
+
389
+ parts.push(`COMPLETED STEPS (already executed):`);
390
+ for (let i = 0; i < context.priorSteps.length; i++) {
391
+ parts.push(` ${i + 1}. [OK] ${context.priorSteps[i]}`);
392
+ }
393
+ parts.push(``);
394
+
395
+ parts.push(`>>> FAILED STEP ${context.priorSteps.length + 1}: ${context.currentStepGoal}`);
396
+ parts.push(` This step FAILED. Your job: fix it using current UI (SoM markers).\n`);
397
+
398
+ if (context.nextSteps && context.nextSteps.length > 0) {
399
+ parts.push(`REMAINING STEPS (auto-executes after you fix current):`);
400
+ for (let i = 0; i < context.nextSteps.length; i++) {
401
+ parts.push(` ${context.priorSteps.length + 2 + i}. ${context.nextSteps[i]}`);
402
+ }
403
+ parts.push(``);
404
+ }
405
+
406
+ parts.push(`REPAIR STRATEGY:`);
407
+ parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
408
+ parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
409
+ parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
410
+ parts.push(`- Use SoM markers to identify current elements`);
411
+ parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
412
+ parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
413
+ parts.push(` → Repair mode = single step fix, then hand back control`);
414
+ parts.push(` → Don't continue to next steps - script will auto-execute them`);
415
+ parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
416
+ }
417
+
335
418
  // Put static instructions first for LLM caching efficiency
336
419
  parts.push('STEP EXECUTION RULES:');
337
420
  parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
338
421
  parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
339
422
  parts.push('- Mark "complete" ONLY if commands succeeded');
340
- parts.push('- Try screenshot tool if you need visual context');
423
+ parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
341
424
  parts.push('- Max 5 iterations per step, then forced STUCK\n');
342
425
 
343
426
  // Dynamic content follows (changes per iteration)
@@ -346,38 +429,67 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
346
429
  // Display note from previous iteration (high priority tactical info)
347
430
  if (context.noteFromPreviousIteration) {
348
431
  const note = context.noteFromPreviousIteration;
349
- parts.push(`📝 YOUR NOTE FROM ITERATION ${note.fromIteration}:`);
432
+ parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
350
433
  parts.push(` ${note.content}`);
351
434
  parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
435
+ parts.push(``);
436
+ parts.push(` ACTION REQUIRED:`);
437
+ parts.push(` 1. Did your previous action work? Check the screenshot!`);
438
+ parts.push(` 2. If it WORKED: Execute next step from your plan`);
439
+ parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
440
+ parts.push(` 4. Write NEW noteToFutureSelf with:`);
441
+ parts.push(` - What worked/didn't work (learn from attempts)`);
442
+ parts.push(` - Updated strategy with new backup plan`);
443
+ parts.push(` - Next alternatives to try if this fails`);
444
+ parts.push(` - Build on previous note's reasoning`);
445
+ parts.push(``);
446
+ parts.push(` DON'T repeat failed actions - try your backup plan!`);
352
447
  parts.push('');
353
448
  }
354
449
 
355
- // Check for screenshot loops (analysis paralysis)
450
+ // Check for screenshot loops (analysis paralysis) - PER STEP tracking
451
+ const screenshotsThisStep = context.recentSteps.filter(s =>
452
+ s.stepNumber === context.stepNumber &&
453
+ (s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'))
454
+ );
356
455
  const recentScreenshots = context.recentSteps.slice(-3).filter(s =>
357
456
  s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')
358
457
  );
359
- if (recentScreenshots.length >= 2) {
360
- parts.push(`🚨🚨🚨 SCREENSHOT LOOP DETECTED 🚨🚨🚨`);
361
- parts.push(`You've taken ${recentScreenshots.length} screenshots in last 3 iterations!`);
362
- parts.push(`STOP analyzing - START ACTING!`);
363
- parts.push(`Use ANY selector from DOM snapshot and try clicking.`);
364
- parts.push(`If command succeeds and new elements appear mark "complete"`);
365
- parts.push(`🚨🚨🚨\n`);
458
+
459
+ if (screenshotsThisStep.length >= 3) {
460
+ parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
461
+ parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
462
+ parts.push(`NO MORE SCREENSHOTS ALLOWED - YOU MUST ACT NOW!`);
463
+ parts.push(`Pick ANY selector from your prior DOM snapshots and TRY IT.`);
464
+ parts.push(`Even if uncertain, execute the command. Failure is better than analysis paralysis.`);
465
+ parts.push(`If toolCalls contains "take_screenshot" → SYSTEM WILL REJECT IT\n`);
466
+ } else if (recentScreenshots.length >= 2) {
467
+ parts.push(`[WARNING] ${recentScreenshots.length} screenshots in last 3 iterations - avoid more screenshots`);
468
+ parts.push(`Use selector recommendations from prior screenshots\n`);
366
469
  }
367
470
 
368
471
  // System warnings for accumulated failures
369
472
  if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
370
- parts.push(`⚠️ SYSTEM WARNING: ${consecutiveFailures} failures!`);
371
- parts.push(`Take screenshot if needed. Try different selector strategy.`);
473
+ parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
474
+ // Only suggest screenshot if we haven't already taken multiple THIS STEP
475
+ if (screenshotsThisStep.length === 0) {
476
+ parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
477
+ } else {
478
+ parts.push(`You already have visual context. Try different selector NOW.`);
479
+ }
372
480
  parts.push(`Question assumptions: Am I at the right step?`);
373
- parts.push(`⚠️\n`);
481
+ parts.push(`[WARNING]\n`);
374
482
  } else if (consecutiveFailures && consecutiveFailures >= 4) {
375
- parts.push(`⚠️ CRITICAL: ${consecutiveFailures} failures!`);
376
- parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
483
+ parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
484
+ if (enableCoordinateMode) {
485
+ parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
486
+ } else {
487
+ parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
488
+ }
377
489
  }
378
490
 
379
- // Trigger coordinate mode if many failures (Phase 1: after 3 failures)
380
- if (consecutiveFailures && consecutiveFailures >= 3) {
491
+ // Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
492
+ if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
381
493
  parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
382
494
  parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
383
495
  parts.push(`You MUST use coordinate-based action now (percentages).`);
@@ -390,6 +502,8 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
390
502
  parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
391
503
  parts.push(`${context.currentStepGoal}`);
392
504
  parts.push(``);
505
+ parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
506
+ parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
393
507
  parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
394
508
 
395
509
  if (context.completedSteps.length > 0) {
@@ -399,14 +513,44 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
399
513
  parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
400
514
  }
401
515
 
516
+ // SoM screenshot (if available)
517
+ if (context.somScreenshot) {
518
+ parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
519
+ parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
520
+ parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
521
+ parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
522
+ parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
523
+ parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
524
+ parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
525
+ parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
526
+ parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
527
+ parts.push(``);
528
+
529
+ // SoM element map for disambiguation
530
+ if (context.somElementMap) {
531
+ parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
532
+ parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
533
+ parts.push(context.somElementMap);
534
+ parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
535
+ parts.push(``);
536
+ }
537
+ }
538
+
402
539
  // Current page state (most variable content - at the end)
403
540
  parts.push(`\nCURRENT PAGE:`);
404
541
  parts.push(`URL: ${context.currentURL}`);
405
542
  parts.push(`Title: ${context.currentPageInfo.title}`);
406
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
407
- parts.push(context.currentPageInfo.formattedElements);
408
- parts.push(`\nARIA TREE (hierarchical structure):`);
409
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
543
+
544
+ // Only include DOM details if NOT in SoM mode
545
+ if (!context.somScreenshot) {
546
+ parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
547
+ parts.push(context.currentPageInfo.formattedElements);
548
+ parts.push(`\nARIA TREE (hierarchical structure):`);
549
+ parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
550
+ } else {
551
+ // In SoM mode, skip DOM details - agent uses visual screenshot instead
552
+ parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
553
+ }
410
554
  if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
411
555
  parts.push('... (truncated)');
412
556
  }
@@ -416,11 +560,11 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
416
560
  if (context.recentSteps.length > 0) {
417
561
  parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
418
562
  for (const step of context.recentSteps) {
419
- const status = step.result === 'success' ? '' : '';
563
+ const status = step.result === 'success' ? '[OK]' : '[FAIL]';
420
564
  parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
421
565
  parts.push(` Code: ${step.code}`);
422
566
  if (step.result === 'failure' && step.error) {
423
- parts.push(` ERROR: ${step.error}`);
567
+ parts.push(` ERROR: ${step.error}`);
424
568
  parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
425
569
  } else {
426
570
  parts.push(` Result: ${step.observation}`);
@@ -435,7 +579,7 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
435
579
  i === 0 || s.code === arr[i-1].code
436
580
  );
437
581
  if (sameSelector) {
438
- parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
582
+ parts.push(`[WARNING] WARNING: You've tried the same selector multiple times and it failed!`);
439
583
  parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
440
584
  parts.push(` YOU MUST try a completely different selector this time!\n`);
441
585
  }
@@ -458,16 +602,184 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
458
602
  parts.push('');
459
603
  }
460
604
 
461
- // Previous iteration guidance
462
- if (context.previousIterationGuidance) {
463
- parts.push(`\nGUIDANCE FROM PREVIOUS ITERATION:`);
464
- parts.push(context.previousIterationGuidance.guidanceForNext);
465
- if (context.previousIterationGuidance.detectingLoop) {
466
- parts.push(`⚠️ LOOP DETECTED: ${context.previousIterationGuidance.loopReasoning}`);
605
+ return parts.join('\n');
606
+ }
607
+
608
/**
 * Build the system prompt for autonomous (exploratory) mode.
 *
 * The prompt consists of a fixed role statement, the caller-supplied tool
 * descriptions, the JSON response interface the model must follow, and a
 * numbered list of exploration guidelines.
 *
 * NOTE(review): the response interface below describes `commands` as plain
 * Playwright strings, while buildExploratoryUserPrompt tells the model to emit
 * SomCommand objects "as defined in the system prompt" when a SoM screenshot is
 * present. This prompt does not define SomCommand - confirm whether another
 * prompt section supplies it.
 *
 * @param toolDescriptions - Pre-formatted description of the available tools;
 *   interpolated verbatim near the top of the prompt.
 * @returns The complete system prompt text.
 */
static buildExploratorySystemPrompt(toolDescriptions: string): string {
  // Guidelines are numbered contiguously (1-10) so the model is not left
  // wondering about a missing rule.
  return `You are an autonomous exploration agent that discovers and tests web application features.

${toolDescriptions}

YOUR RESPONSE FORMAT - Output JSON matching this interface:

interface AgentDecisionLLMResponse {
  status: string; // "continue" | "complete" | "stuck"
  reasoning: string; // What you're exploring and why

  // COMMANDS: Array of plain Playwright command strings
  commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
  commandReasoning?: string;
  toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
    name: string;
    params: Record<string, any>;
  }>;
  toolReasoning?: string;
  needsToolResults?: boolean;
  noteToFutureSelf?: string;
  coordinateAction?: { ... };
  experiences?: string[]; // Use for BOTH app patterns AND exploration progress
  blockerDetected?: { ... };
  debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
    suggestedPromptUpdates?: string;
    reasoning?: string;
  };
}

EXPLORATION MODE GUIDELINES:

1. **JOURNEY-FOCUSED EXPLORATION**: Follow the exploration prompt as your goal for THIS journey
   - Example prompt: "Explore Dashboard and test all widgets"
   - You should systematically test dashboard widgets, not wander off to other sections
   - Stay focused on the given journey goal

2. **ICON BUTTONS**: Match step goal to icon semantics (plus=add, gear=settings, hamburger=menu). Check element map for confirmation. Don't randomly try - reason about fit.

3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).

4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.

5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.

6. **LIMITATIONS**: Cannot complete: sign-up, forgot password, OTP, CAPTCHA, email verification (no inbox/SMS access).
   If encountered: CAPTCHA → stuck, sign-up/OTP → skip and explore other areas.

7. **AUTH**: If credentials provided, login FIRST using exact testDataPrompt values. Don't explore public pages or click sign-up.

8. **BLOCKERS**: Clear cookie modals, tour popups autonomously with blockerDetected.clearingCommands. CAPTCHA → stuck.

9. **STATUS**: complete=goal achieved or budget low, continue=need more, stuck=cannot proceed. Complete when journey goal met, don't wait for maxSteps.

10. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.

CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
}
670
+
671
/**
 * Build the user prompt for one iteration of an exploratory journey.
 *
 * Sections are appended in this order: SoM command-format reminder (only when
 * a SoM screenshot is present), journey goal, optional test data/credentials,
 * optional step progress, discovered data, SoM screenshot instructions and
 * element map, current page summary, recent actions, accumulated experiences,
 * the note from the previous iteration, and a closing call to action.
 *
 * @param context - Agent state for the current iteration (page info, recent
 *   steps, extracted data, experiences, optional SoM screenshot/element map).
 * @param explorationPrompt - The journey goal the agent should pursue.
 * @param testDataPrompt - Optional test data or credentials to surface.
 * @param stepNumber - Optional current step; shown only together with maxSteps.
 * @param maxSteps - Optional step budget; shown only together with stepNumber.
 * @returns All sections joined with newline characters.
 */
static buildExploratoryUserPrompt(
  context: AgentContext,
  explorationPrompt: string,
  testDataPrompt?: string,
  stepNumber?: number,
  maxSteps?: number
): string {
  // Upper bound on how much of the pretty-printed ARIA tree is sent to the model.
  const ARIA_TREE_MAX_CHARS = 5000;
  const parts: string[] = [];

  // Add SoM format reminder if screenshot is present
  if (context.somScreenshot) {
    parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
    parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
    parts.push(`Format: { "elementRef": "1", "action": "click" }`);
    parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
  }

  parts.push('=== JOURNEY EXPLORATION CONTEXT ===\n');
  parts.push(`GOAL: ${explorationPrompt}`);
  parts.push(` (Focus on THIS specific goal - don't wander to unrelated areas)\n`);

  if (testDataPrompt) {
    parts.push(`TEST DATA/CREDENTIALS: ${testDataPrompt}`);
    parts.push(` [WARNING] IMPORTANT: If credentials are provided above (email/username and password), you MUST:`);
    parts.push(` - Use them to LOGIN and explore authenticated features`);
    parts.push(` - Fill login forms with the exact credentials provided`);
    parts.push(` - Don't waste time on public/unauthenticated pages when you can login`);
    parts.push(` - Prioritize exploring the authenticated app experience\n`);
  }

  if (stepNumber && maxSteps) {
    parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
  }

  // Show discovered and tracked data from extractedData
  if (context.extractedData && Object.keys(context.extractedData).length > 0) {
    parts.push(`\nDISCOVERED DATA (this journey):`);
    for (const [key, value] of Object.entries(context.extractedData)) {
      // Strings are printed as-is; anything else is serialized so structured
      // values do not degrade to "[object Object]".
      const rendered = typeof value === 'string' ? value : JSON.stringify(value);
      parts.push(` ${key}: ${rendered}`);
    }
  }

  // SoM screenshot (if available)
  if (context.somScreenshot) {
    parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
    parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
    parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
    parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
    parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
    parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
    parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
    parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
    parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
    parts.push(``);

    // SoM element map for disambiguation
    if (context.somElementMap) {
      parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
      parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
      parts.push(context.somElementMap);
      parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
      parts.push(``);
    }
  }

  parts.push(`\nCURRENT PAGE:`);
  parts.push(`URL: ${context.currentURL}`);
  parts.push(`Title: ${context.currentPageInfo.title}`);

  // Only include DOM details if NOT in SoM mode
  if (!context.somScreenshot) {
    parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
    parts.push(context.currentPageInfo.formattedElements);
    parts.push(`\nARIA TREE (hierarchical structure):`);
    // JSON.stringify yields undefined for an undefined snapshot; fall back to
    // an empty string instead of throwing on .substring/.length.
    const ariaTree = JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2) || '';
    parts.push(ariaTree.substring(0, ARIA_TREE_MAX_CHARS));
    // The marker is decided from the same pretty-printed string that was cut,
    // and only in this branch, so it never appears without a tree above it.
    if (ariaTree.length > ARIA_TREE_MAX_CHARS) {
      parts.push('... (truncated)');
    }
  } else {
    // In SoM mode, skip DOM details - agent uses visual screenshot
    parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
  }

  // Recent actions
  if (context.recentSteps.length > 0) {
    parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
    for (const step of context.recentSteps) {
      const status = step.result === 'success' ? '[OK]' : '[FAIL]';
      parts.push(` ${status} ${step.action}`);
      parts.push(` ${step.observation}`);
    }
  }

  // Learnings and exploration progress
  if (context.experiences && context.experiences.length > 0) {
    parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
    for (const exp of context.experiences) {
      parts.push(` • ${exp}`);
    }
  }

  // Note from previous iteration
  if (context.noteFromPreviousIteration) {
    parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
    parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
  }

  parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);

  return parts.join('\n');
}
473
785
  }