testchimp-runner-core 0.0.34 → 0.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/dist/execution-service.d.ts +1 -4
  2. package/dist/execution-service.d.ts.map +1 -1
  3. package/dist/execution-service.js +155 -468
  4. package/dist/execution-service.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +11 -1
  8. package/dist/index.js.map +1 -1
  9. package/dist/orchestrator/decision-parser.d.ts +18 -0
  10. package/dist/orchestrator/decision-parser.d.ts.map +1 -0
  11. package/dist/orchestrator/decision-parser.js +127 -0
  12. package/dist/orchestrator/decision-parser.js.map +1 -0
  13. package/dist/orchestrator/index.d.ts +4 -2
  14. package/dist/orchestrator/index.d.ts.map +1 -1
  15. package/dist/orchestrator/index.js +14 -2
  16. package/dist/orchestrator/index.js.map +1 -1
  17. package/dist/orchestrator/orchestrator-agent.d.ts +17 -14
  18. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  19. package/dist/orchestrator/orchestrator-agent.js +534 -204
  20. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  21. package/dist/orchestrator/orchestrator-prompts.d.ts +14 -2
  22. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
  23. package/dist/orchestrator/orchestrator-prompts.js +529 -247
  24. package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
  25. package/dist/orchestrator/page-som-handler.d.ts +106 -0
  26. package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
  27. package/dist/orchestrator/page-som-handler.js +1353 -0
  28. package/dist/orchestrator/page-som-handler.js.map +1 -0
  29. package/dist/orchestrator/som-types.d.ts +149 -0
  30. package/dist/orchestrator/som-types.d.ts.map +1 -0
  31. package/dist/orchestrator/som-types.js +87 -0
  32. package/dist/orchestrator/som-types.js.map +1 -0
  33. package/dist/orchestrator/tool-registry.d.ts +2 -0
  34. package/dist/orchestrator/tool-registry.d.ts.map +1 -1
  35. package/dist/orchestrator/tool-registry.js.map +1 -1
  36. package/dist/orchestrator/tools/index.d.ts +4 -1
  37. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  38. package/dist/orchestrator/tools/index.js +7 -2
  39. package/dist/orchestrator/tools/index.js.map +1 -1
  40. package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
  41. package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
  42. package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
  43. package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
  44. package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
  45. package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
  46. package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
  47. package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
  48. package/dist/orchestrator/types.d.ts +23 -1
  49. package/dist/orchestrator/types.d.ts.map +1 -1
  50. package/dist/orchestrator/types.js +11 -1
  51. package/dist/orchestrator/types.js.map +1 -1
  52. package/dist/scenario-service.d.ts +5 -0
  53. package/dist/scenario-service.d.ts.map +1 -1
  54. package/dist/scenario-service.js +17 -0
  55. package/dist/scenario-service.js.map +1 -1
  56. package/dist/scenario-worker-class.d.ts +4 -0
  57. package/dist/scenario-worker-class.d.ts.map +1 -1
  58. package/dist/scenario-worker-class.js +18 -3
  59. package/dist/scenario-worker-class.js.map +1 -1
  60. package/dist/testing/agent-tester.d.ts +35 -0
  61. package/dist/testing/agent-tester.d.ts.map +1 -0
  62. package/dist/testing/agent-tester.js +84 -0
  63. package/dist/testing/agent-tester.js.map +1 -0
  64. package/dist/testing/ref-translator-tester.d.ts +44 -0
  65. package/dist/testing/ref-translator-tester.d.ts.map +1 -0
  66. package/dist/testing/ref-translator-tester.js +104 -0
  67. package/dist/testing/ref-translator-tester.js.map +1 -0
  68. package/dist/utils/hierarchical-selector.d.ts +47 -0
  69. package/dist/utils/hierarchical-selector.d.ts.map +1 -0
  70. package/dist/utils/hierarchical-selector.js +212 -0
  71. package/dist/utils/hierarchical-selector.js.map +1 -0
  72. package/dist/utils/page-info-retry.d.ts +14 -0
  73. package/dist/utils/page-info-retry.d.ts.map +1 -0
  74. package/dist/utils/page-info-retry.js +60 -0
  75. package/dist/utils/page-info-retry.js.map +1 -0
  76. package/dist/utils/page-info-utils.d.ts +1 -0
  77. package/dist/utils/page-info-utils.d.ts.map +1 -1
  78. package/dist/utils/page-info-utils.js +46 -18
  79. package/dist/utils/page-info-utils.js.map +1 -1
  80. package/dist/utils/ref-attacher.d.ts +21 -0
  81. package/dist/utils/ref-attacher.d.ts.map +1 -0
  82. package/dist/utils/ref-attacher.js +149 -0
  83. package/dist/utils/ref-attacher.js.map +1 -0
  84. package/dist/utils/ref-translator.d.ts +49 -0
  85. package/dist/utils/ref-translator.d.ts.map +1 -0
  86. package/dist/utils/ref-translator.js +276 -0
  87. package/dist/utils/ref-translator.js.map +1 -0
  88. package/package.json +6 -1
  89. package/RELEASE_0.0.26.md +0 -165
  90. package/RELEASE_0.0.27.md +0 -236
  91. package/RELEASE_0.0.28.md +0 -286
  92. package/plandocs/BEFORE_AFTER_VERIFICATION.md +0 -148
  93. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +0 -144
  94. package/plandocs/CREDIT_CALLBACK_ARCHITECTURE.md +0 -253
  95. package/plandocs/HUMAN_LIKE_IMPROVEMENTS.md +0 -642
  96. package/plandocs/IMPLEMENTATION_STATUS.md +0 -108
  97. package/plandocs/INTEGRATION_COMPLETE.md +0 -322
  98. package/plandocs/MULTI_AGENT_ARCHITECTURE_REVIEW.md +0 -844
  99. package/plandocs/ORCHESTRATOR_MVP_SUMMARY.md +0 -539
  100. package/plandocs/PHASE1_ABSTRACTION_COMPLETE.md +0 -241
  101. package/plandocs/PHASE1_FINAL_STATUS.md +0 -210
  102. package/plandocs/PHASE_1_COMPLETE.md +0 -165
  103. package/plandocs/PHASE_1_SUMMARY.md +0 -184
  104. package/plandocs/PLANNING_SESSION_SUMMARY.md +0 -372
  105. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +0 -120
  106. package/plandocs/PROMPT_SANITY_CHECK.md +0 -120
  107. package/plandocs/SCRIPT_CLEANUP_FEATURE.md +0 -201
  108. package/plandocs/SCRIPT_GENERATION_ARCHITECTURE.md +0 -364
  109. package/plandocs/SELECTOR_IMPROVEMENTS.md +0 -139
  110. package/plandocs/SESSION_SUMMARY_v0.0.33.md +0 -151
  111. package/plandocs/TROUBLESHOOTING_SESSION.md +0 -72
  112. package/plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md +0 -336
  113. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +0 -396
  114. package/plandocs/WHATS_NEW_v0.0.33.md +0 -183
  115. package/src/auth-config.ts +0 -84
  116. package/src/credit-usage-service.ts +0 -188
  117. package/src/env-loader.ts +0 -103
  118. package/src/execution-service.ts +0 -1413
  119. package/src/file-handler.ts +0 -104
  120. package/src/index.ts +0 -422
  121. package/src/llm-facade.ts +0 -821
  122. package/src/llm-provider.ts +0 -53
  123. package/src/model-constants.ts +0 -35
  124. package/src/orchestrator/index.ts +0 -34
  125. package/src/orchestrator/orchestrator-agent.ts +0 -862
  126. package/src/orchestrator/orchestrator-agent.ts.backup +0 -1386
  127. package/src/orchestrator/orchestrator-prompts.ts +0 -474
  128. package/src/orchestrator/tool-registry.ts +0 -182
  129. package/src/orchestrator/tools/check-page-ready.ts +0 -75
  130. package/src/orchestrator/tools/extract-data.ts +0 -92
  131. package/src/orchestrator/tools/index.ts +0 -12
  132. package/src/orchestrator/tools/inspect-page.ts +0 -42
  133. package/src/orchestrator/tools/recall-history.ts +0 -72
  134. package/src/orchestrator/tools/take-screenshot.ts +0 -128
  135. package/src/orchestrator/tools/verify-action-result.ts +0 -159
  136. package/src/orchestrator/types.ts +0 -248
  137. package/src/playwright-mcp-service.ts +0 -224
  138. package/src/progress-reporter.ts +0 -144
  139. package/src/prompts.ts +0 -842
  140. package/src/providers/backend-proxy-llm-provider.ts +0 -91
  141. package/src/providers/local-llm-provider.ts +0 -38
  142. package/src/scenario-service.ts +0 -232
  143. package/src/scenario-worker-class.ts +0 -1089
  144. package/src/script-utils.ts +0 -203
  145. package/src/types.ts +0 -239
  146. package/src/utils/browser-utils.ts +0 -348
  147. package/src/utils/coordinate-converter.ts +0 -162
  148. package/src/utils/page-info-utils.ts +0 -250
  149. package/testchimp-runner-core-0.0.33.tgz +0 -0
  150. package/tsconfig.json +0 -19
@@ -9,223 +9,259 @@ class OrchestratorPrompts {
9
9
  /**
10
10
  * Build main system prompt for selector-based mode
11
11
  */
12
- static buildSystemPrompt(toolDescriptions) {
12
+ static buildSystemPrompt(toolDescriptions, enableCoordinateMode = false) {
13
13
  return `You are an intelligent test automation agent that executes web scenarios using Playwright.
14
14
 
15
+ DISCRETE EXPERIENCE LOOP:
16
+ You operate in iterations: receive state → decide → sleep → wake with new state.
17
+
18
+ Key implications:
19
+ - System waits for page stability after each batch
20
+ - Effects may be transient (alerts) or persistent (error labels) - suggest checking persistent indicators
21
+ - Batch safe commands (fill all fields together), separate DOM-changing ones (click then fill)
22
+ - Note to future self: strategy, what to verify, backup plans if action fails
23
+
24
+ EXECUTION PRIORITIES:
25
+ 1. SoM-marked elements (reliable selectors)
26
+ 2. Keyboard nav from marked elements (Tab, Enter)
27
+ 3. Coords for unmarked elements (valid fallback)
28
+
29
+ Difficult: Shadow DOM, iframes, canvas UIs, file uploads - try keyboard or coords.
30
+
15
31
  ${toolDescriptions}
16
32
 
17
- YOUR RESPONSE FORMAT - Output JSON matching this interface:
33
+ CRITICAL: STEP COMPLETION DECISION
34
+ Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
35
+
36
+ **When to mark COMPLETE:**
37
+ - Step: "Navigate to URL" → Mark complete after navigate command succeeds (don't login yet!)
38
+ - Step: "Fill login form" → Mark complete after filling fields (don't click submit yet!)
39
+ - Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
40
+
41
+ **DO NOT:**
42
+ - Continue with future steps while still on current step
43
+ - Assume the step wants you to do more than stated
44
+ - Wait for side effects (navigation, etc.) before marking complete
45
+
46
+ **The goal text is LITERAL** - do exactly what it says, then mark complete.
47
+
48
+ OUTPUT FORMAT (JSON):
49
+
50
+ {
51
+ "status": "continue" | "complete" | "stuck" | "infeasible",
52
+ "reasoning": "your thinking",
53
+
54
+ "commands": [ // Mix ref and playwright commands as needed
55
+ { "type": "playwright", "code": "await page.goto('https://example.com')" },
56
+ { "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
57
+ { "type": "ref", "ref": "e31", "operation": "click" },
58
+ { "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
59
+ ],
60
+
61
+ "toolCalls": [{ "name": "tool_name", "params": {} }],
62
+ "blockerDetected": { "description": "...", "clearingCommands": ["..."] },
63
+ "experiences": ["app pattern"],
64
+ "noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
65
+ "debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
66
+ "suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
67
+ "reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
68
+ }
69
+ }
18
70
 
19
- interface AgentDecisionLLMResponse {
20
- status: string; // REQUIRED: "continue" | "complete" | "stuck" | "infeasible"
21
- reasoning: string; // REQUIRED: Your thinking - what you're doing and why
22
- commands?: string[]; // Playwright commands to execute
23
- commandReasoning?: string; // Why these commands
24
- toolCalls?: Array<{ // Tools to call
25
- name: string;
26
- params: Record<string, any>;
27
- }>;
28
- toolReasoning?: string; // Why these tools
29
- needsToolResults?: boolean; // Wait for tool results before commands
30
- noteToFutureSelf?: string; // Free-form tactical note for next iteration
31
- coordinateAction?: { // Use when selectors fail (after 3 attempts)
32
- type: "coordinate";
33
- action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
34
- xPercent: number; // 0-100, 3 decimals (e.g., 15.755)
35
- yPercent: number; // 0-100, 3 decimals (e.g., 8.500)
36
- toXPercent?: number; // For drag
37
- toYPercent?: number; // For drag
38
- value?: string; // For fill
39
- scrollAmount?: number; // For scroll
40
- };
41
- selfReflection?: {
42
- guidanceForNext: string;
43
- detectingLoop: boolean;
44
- loopReasoning?: string;
45
- };
46
- experiences?: string[]; // App-specific learnings
47
- blockerDetected?: {
48
- description: string;
49
- clearingCommands: string[];
50
- };
51
- stepReEvaluation?: {
52
- detected: boolean;
53
- issue: "prior_incomplete" | "already_done" | "wrong_order" | null;
54
- explanation: string;
55
- };
71
+ NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
72
+ Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
73
+ Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
74
+
75
+ META-LEARNING (debugInfo): Could this prompt have been better. Suggest fixes.
76
+
77
+ STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
78
+ RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
79
+
80
+ COMMANDS FORMAT:
81
+
82
+ Array of plain Playwright command strings:
83
+ {
84
+ "commands": [
85
+ "await page.fill('input[name=\"email\"]', 'user@test.com')",
86
+ "await page.fill('input[name=\"password\"]', 'secret123')",
87
+ "await page.click('button[type=\"submit\"]')"
88
+ ]
89
+ }
90
+
91
+ SELECTOR STRATEGIES (use in order of preference):
92
+ 1. getByRole: page.getByRole('button', {name: 'Login'})
93
+ 2. getByLabel: page.getByLabel('Email address')
94
+ 3. getByPlaceholder: page.getByPlaceholder('Enter email')
95
+ 4. getByText: page.getByText('Sign in')
96
+ 5. CSS: page.locator('input[name="email"]')
97
+ 6. Test IDs: page.getByTestId('login-button')
98
+
99
+ Example login commands:
100
+ {
101
+ "commands": [
102
+ "await page.getByLabel('Email').fill('user@test.com')",
103
+ "await page.getByLabel('Password').fill('secret123')",
104
+ "await page.getByRole('button', {name: 'Submit'}).click()"
105
+ ]
56
106
  }
57
107
 
58
- STATUS DECISION RULES (CRITICAL - Think carefully!):
59
-
60
- RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
61
- - Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
62
- - EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
63
- - System will OVERRIDE and force "continue" if you violate this
64
-
65
- RULE #2: Decision tree:
66
- - Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
67
- - Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
68
-
69
- Status meanings:
70
- - "complete": Commands succeeded AND goal achieved
71
- - "continue": Command failed OR need more actions
72
- - "stuck": Tried 5 iterations, all failed, can't proceed
73
- - "infeasible": Goal impossible (element truly doesn't exist)
74
-
75
- Examples:
76
- ✅ Command: page.click('button') → Success → Goal done → status: "complete"
77
- ❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
78
- ✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
79
-
80
- STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
81
-
82
- After repeated failures, check:
83
- 1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
84
- 2. LOOK FORWARD: Is current step already done?
85
- 3. LOOK AHEAD: Is next step more feasible with current page state?
86
-
87
- Stick to original plan unless clear evidence suggests otherwise.
88
-
89
- BLOCKER DETECTION:
90
-
91
- Use when unexpected UI blocks current goal (modals, overlays, prompts).
92
- Provide clearingCommands to dismiss blocker, then regular commands execute.
93
-
94
- Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
95
- NOT blockers: Wrong selectors, missing elements (those are "infeasible")
96
-
97
- EXPERIENCES - App-specific patterns only:
98
- - Concise, 1-2 per iteration
99
- - Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
100
- - NOT obvious things ("button has role=button") or individual selectors
101
- - Combine related learnings
102
-
103
- CRITICAL RULES FOR DECISION MAKING:
104
-
105
- DECISION-MAKING PROCESS:
106
-
107
- 1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
108
-
109
- 2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
110
-
111
- 3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
112
- Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
113
-
114
- 4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
115
-
116
- 5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
117
-
118
- 6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
119
- - You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
120
- - ONLY use what's in the list - DON'T invent text/names not shown
121
- - Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
122
- - ⚠️ If text appears multiple times → scope to parent or use role filter
123
- - Element not in list? → take_screenshot
124
- - ARIA tree = source of truth for roles/names
125
-
126
- ⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
127
- - Tooltips (role="tooltip") - informational only, NOT clickable
128
- - Status messages (role="status", role="alert") - display info, NOT clickable
129
- - Popovers with no buttons inside - usually just show info on hover
130
- - Hidden elements (aria-hidden="true", display:none) - can't interact
131
-
132
- If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
133
-
134
- ⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
135
- - Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
136
- - BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
137
- - GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
138
- - GOOD: locator('button').getByText('Settings') → scoped to button element
139
- - Always prefer role-based selectors when text might appear in tooltips
140
-
141
- 7. **SELECTOR PREFERENCE** (CRITICAL):
142
- Prefer in order:
143
- 1. getByRole/Label/Placeholder - Semantic, stable
144
- 2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
145
- 3. data-testid or stable IDs
146
- 4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
147
-
148
- ⚠️ Common mistakes:
149
- - getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
150
- Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
151
- - Missing timeout on goto() → Add { timeout: 30000 }
152
- - Using auto-generated IDs → Break when components re-render
153
-
154
- 8. **ASSERTIONS** (CRITICAL):
155
- Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
156
-
157
- When to use:
158
- - "Verify X appears" → await expect(locator).toBeVisible()
159
- - "Check field is empty" → await expect(locator).toBeEmpty()
160
- - "Confirm URL" → await expect(page).toHaveURL('...')
161
-
162
- When NOT to use:
163
- - "Send message" → DON'T verify it appeared (unless step asks)
164
- - "Click Submit" → DON'T check button state
165
- - "Fill field" → DON'T verify it's filled
166
-
167
- 9. **TOOLS vs COMMANDS**:
168
- Tools = read-only info gathering (screenshot, recall_history, extract_data)
169
- Commands = state changes (Playwright: goto, click, fill, etc.)
170
-
171
- Navigation commands MUST include timeout:
172
- - page.goto(url, { waitUntil: 'load', timeout: 30000 })
173
- - page.waitForLoadState('load', { timeout: 30000 })
174
-
175
- 10. **ERROR ANALYSIS** (Think about what went wrong):
176
- - "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
177
- - "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
178
- - "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
179
- - "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
180
- - Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
181
-
182
- 11. **WHEN TO RUN COMMANDS vs TOOLS**:
183
- - Confident about selectors from DOM → Run commands directly
184
- - Unsure or failed 2+ times → Take screenshot first
185
- - First iteration of a step → Usually can run commands from DOM
186
- - After successful command → mark "complete" if goal achieved (trust Playwright - if it succeeded, it worked)
187
-
188
- 12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
189
-
190
- Write FREE-FORM notes for your next iteration about:
191
- - What you tried and why it failed
192
- - Hypothesis being tested
193
- - Plan for next attempt
194
- - Page behavior patterns observed
195
-
196
- Your next iteration reads this FIRST - use it to maintain strategic continuity.
197
-
198
- 13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
199
-
200
- Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
201
- - xPercent: 0=left, 100=right
202
- - yPercent: 0=top, 100=bottom
203
-
204
- Format:
205
- {
206
- "coordinateAction": {
207
- "type": "coordinate",
208
- "action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
209
- "xPercent": 15.755, "yPercent": 8.500,
210
- "toXPercent": 45.25, "toYPercent": 8.50, // For drag
211
- "value": "text", // For fill
212
- "scrollAmount": 500 // For scroll
213
- }
108
+ INTERACTIVE ELEMENTS section shows available selectors for each element.`;
214
109
  }
215
-
216
- AFTER coordinate action succeeds:
217
- - If goal verification unclear → CALL verify_action_result tool
218
- - Tool compares before/after screenshots to confirm goal achieved
219
- - If verified: mark status="complete"
220
- - If not verified: try different coordinates (2 attempts max)
221
-
222
- Example after coordinate click:
223
- {
224
- "status": "continue",
225
- "reasoning": "Coordinate click succeeded, verifying if dashboard page loaded",
226
- "toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page with data grid visible"}}],
227
- "needsToolResults": true
228
- }`;
110
+ /**
111
+ * Build SoM (Set-of-Marks) system prompt for visual element identification
112
+ */
113
+ static buildSomSystemPrompt(restrictCoordinates = false) {
114
+ const coordinateRestriction = restrictCoordinates ? `
115
+
116
+ CRITICAL: COORDINATE COMMANDS RESTRICTED
117
+ You are in SCRIPT GENERATION mode. Coordinate-based commands should ONLY be used as an ABSOLUTE LAST RESORT.
118
+
119
+ Strong preference order:
120
+ 1. Use SoM-marked elements with actions (fill, click, press Enter)
121
+ 2. Use keyboard navigation from SoM-marked elements (Tab, Arrow keys, Enter to submit)
122
+ 3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
123
+
124
+ If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
125
+ return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
126
+
127
+ DISCRETE EXPERIENCE LOOP:
128
+ You operate in iterations: receive state → decide → sleep → wake with new state.
129
+ System waits for page stability after each batch - you ALWAYS receive fully loaded pages (never loading screens).
130
+ Batch safe commands, suggest persistent indicators, tell future self what to verify.
131
+
132
+ IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
133
+
134
+ SCREENSHOT SCOPE:
135
+ - Shows VIEWPORT ONLY (what's currently visible, not full page)
136
+ - Elements outside the viewport are NOT shown (you must scroll to reveal them)
137
+ - If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
138
+
139
+ VISUAL MARKER SYSTEM:
140
+ - Each interactive element has a colored bounding box with a unique color
141
+ - The element ID (1, 2, 3, etc.) is displayed in a label at the TOP-RIGHT corner, OUTSIDE the box
142
+ - The label is typically positioned OUTSIDE and ABOVE the bounding box (not attached)
143
+ - The label color matches the bounding box color for easy correlation
144
+ - TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color
145
+
146
+ REFERENCE ELEMENTS BY ID:
147
+ - To interact with an element, reference its ID in your commands
148
+
149
+ ICON BUTTON IDENTIFICATION:
150
+ When step involves icon buttons (no visible text), use COMMON ICON SEMANTICS + element map:
151
+ - Match step goal to icon meaning: "Add Campaign" → plus icon, "Settings" → gear, "Menu" → hamburger, "Delete" → trash
152
+ - Check element map for aria-label confirmation: [5] might show (aria: "add-campaign")
153
+ - Common icons: hamburger=menu, gear=settings, plus=add, trash=delete, arrow=back/nav, check=confirm, X=close, magnifier=search, dots=more
154
+ - DON'T randomly try icon buttons - reason about which icon fits the step goal
155
+
156
+ Example: Step "Add new campaign" → Look for plus icon in toolbar → Check map shows aria "add" → Use that ID.
157
+
158
+ CRITICAL: ONLY INTERACT WITH VISIBLE ELEMENTS - use your EYES, not assumptions!
159
+ FORBIDDEN: Guessing locations, assuming "typical" positions, clicking without seeing element.
160
+ REQUIRED: Only interact with elements you SEE in screenshot. If not visible, scroll or use take_screenshot(isFullPage=true).
161
+ If action fails, try alternative elements - don't repeat same ID blindly.
162
+
163
+ TYPESCRIPT INTERFACES (your response MUST conform to these):
164
+
165
+ \`\`\`typescript
166
+ interface Coordinate {
167
+ x: number; // Percentage of viewport width (0-100, use 3 decimals: 15.625)
168
+ y: number; // Percentage of viewport height (0-100, use 3 decimals: 82.375)
169
+ }
170
+
171
+ interface SomCommand {
172
+ action: InteractionAction; // REQUIRED: Action to perform (distinguishes from SomVerification)
173
+ elementRef?: string; // Element ID from screenshot (e.g., "1", "2", "42")
174
+ coord?: Coordinate; // Direct percentage-based coords (use when SoM marker missing)
175
+ value?: string; // For fill/select/press actions
176
+ fromCoord?: Coordinate; // For drag actions (start point)
177
+ toCoord?: Coordinate; // For drag actions (end point)
178
+ // ... other optional parameters
179
+ }
180
+
181
+ interface SomVerification {
182
+ verificationType: VerificationType; // REQUIRED: Type of verification (distinguishes from SomCommand)
183
+ elementRef?: string; // SoM ID (e.g., "3") - optional for count verifications
184
+ expected?: string | number; // Expected value/text/count
185
+ description?: string; // Human-readable description
186
+ selector?: string; // CSS selector for count verifications (e.g., 'ul.items > li')
187
+ }
188
+
189
+ // See available verifications in comment above
190
+
191
+ COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationType').
192
+ Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
193
+ CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
194
+
195
+ COORDINATES (when SoM marker missing):
196
+ Use percentage-based coords for unmarked elements:
197
+ { "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
198
+
199
+ Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
200
+ After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
201
+
202
+ NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
203
+ Example: { "action": "navigate", "value": "https://..." }
204
+ DON'T click address bar - use navigate action. System waits for page load after navigation.
205
+
206
+ // Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
207
+ // Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
208
+
209
+ interface AgentDecisionLLMResponse {
210
+ status: "continue" | "complete" | "stuck" | "infeasible";
211
+ reasoning: string;
212
+ commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
213
+ commandReasoning?: string;
214
+ toolCalls?: Array<{ name: string; params: Record<string, any> }>;
215
+ noteToFutureSelf?: string;
216
+ experiences?: string[];
217
+ blockerDetected?: { description: string; clearingCommands: SomCommand[] };
218
+ debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
219
+ }
220
+ \`\`\`
221
+
222
+ NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
223
+ Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
224
+
225
+ EXAMPLE RESPONSES:
226
+
227
+ Action step:
228
+ \`\`\`json
229
+ {
230
+ "status": "continue",
231
+ "reasoning": "Need to fill login form with credentials",
232
+ "commands": [
233
+ { "elementRef": "5", "action": "fill", "value": "user@example.com" },
234
+ { "elementRef": "7", "action": "fill", "value": "password123" },
235
+ { "elementRef": "12", "action": "click" }
236
+ ],
237
+ "commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
238
+ }
239
+ \`\`\`
240
+
241
+ Verification step:
242
+ \`\`\`json
243
+ {
244
+ "status": "complete",
245
+ "reasoning": "Message sent and verified in conversation",
246
+ "commands": [
247
+ { "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
248
+ { "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
249
+ ],
250
+ "commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
251
+ }
252
+ \`\`\`
253
+
254
+ REPAIR MODE - Step already completed (DELETE case):
255
+ \`\`\`json
256
+ {
257
+ "status": "complete",
258
+ "reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
259
+ "commands": [],
260
+ "commandReasoning": "No commands needed - step goal already achieved/obsolete"
261
+ }
262
+ \`\`\`
263
+
264
+ OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
229
265
  }
230
266
  /**
231
267
  * Build coordinate-specific system prompt (used when selectors repeatedly fail)
@@ -236,17 +272,20 @@ DECISION-MAKING PROCESS:
236
272
  YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
237
273
 
238
274
  SCREENSHOT PROVIDED:
239
- You will see a screenshot with visual indicators (bounding boxes or markers).
275
+ You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
240
276
 
241
277
  CRITICAL - IDENTIFY THE CORRECT ELEMENT:
242
278
  1. READ the step goal carefully - what specific element are you looking for?
243
- 2. LOCATE that element in the screenshot (NOT a similar-looking element!)
244
- 3. VERIFY position using screen regions:
279
+ 2. Look for the colored bounding box that matches the element description
280
+ 3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
281
+ 4. Match the label color to the bounding box color
282
+ 5. LOCATE that element in the screenshot (NOT a similar-looking element!)
283
+ 6. VERIFY position using screen regions:
245
284
  - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
246
285
  - Center content: xPercent ~30-70%
247
286
  - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
248
- 4. CALCULATE percentages from element's CENTER position
249
- 5. SANITY CHECK your percentages:
287
+ 7. CALCULATE percentages from element's CENTER position
288
+ 8. SANITY CHECK your percentages:
250
289
  - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
251
290
  - Button in top-left at 90%? WRONG - that's top-right!
252
291
  - Element description says "left" but x > 50%? WRONG - recheck!
@@ -258,7 +297,7 @@ Goal: "Click Settings link in left navigation"
258
297
  → Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
259
298
  → Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
260
299
  → xPercent: 12.500, yPercent: 32.000
261
- → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
300
+ → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
262
301
  → Description: "Clicking center of Settings link in left sidebar"
263
302
 
264
303
  CRITICAL VISUAL ESTIMATION TIPS:
@@ -326,48 +365,113 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
326
365
  /**
327
366
  * Build user prompt with context
328
367
  */
329
- static buildUserPrompt(context, consecutiveFailures) {
368
+ static buildUserPrompt(context, consecutiveFailures, enableCoordinateMode = false) {
330
369
  const parts = [];
370
+ // Add SoM format reminder if screenshot is present
371
+ if (context.somScreenshot) {
372
+ parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
373
+ parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
374
+ parts.push(`Format: { "elementRef": "5", "action": "fill", "value": "text" }`);
375
+ parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
376
+ }
377
+ // Add repair mode context if present
378
+ if (context.priorSteps && context.priorSteps.length > 0) {
379
+ parts.push(`=== REPAIR MODE ===`);
380
+ parts.push(`Fixing a FAILED step in existing script. Page persisted from prior steps.\n`);
381
+ parts.push(`COMPLETED STEPS (already executed):`);
382
+ for (let i = 0; i < context.priorSteps.length; i++) {
383
+ parts.push(` ${i + 1}. [OK] ${context.priorSteps[i]}`);
384
+ }
385
+ parts.push(``);
386
+ parts.push(`>>> FAILED STEP ${context.priorSteps.length + 1}: ${context.currentStepGoal}`);
387
+ parts.push(` This step FAILED. Your job: fix it using current UI (SoM markers).\n`);
388
+ if (context.nextSteps && context.nextSteps.length > 0) {
389
+ parts.push(`REMAINING STEPS (auto-executes after you fix current):`);
390
+ for (let i = 0; i < context.nextSteps.length; i++) {
391
+ parts.push(` ${context.priorSteps.length + 2 + i}. ${context.nextSteps[i]}`);
392
+ }
393
+ parts.push(``);
394
+ }
395
+ parts.push(`REPAIR STRATEGY:`);
396
+ parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
397
+ parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
398
+ parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
399
+ parts.push(`- Use SoM markers to identify current elements`);
400
+ parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
401
+ parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
402
+ parts.push(` → Repair mode = single step fix, then hand back control`);
403
+ parts.push(` → Don't continue to next steps - script will auto-execute them`);
404
+ parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
405
+ }
331
406
  // Put static instructions first for LLM caching efficiency
332
407
  parts.push('STEP EXECUTION RULES:');
333
408
  parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
334
409
  parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
335
410
  parts.push('- Mark "complete" ONLY if commands succeeded');
336
- parts.push('- Try screenshot tool if you need visual context');
411
+ parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
337
412
  parts.push('- Max 5 iterations per step, then forced STUCK\n');
338
413
  // Dynamic content follows (changes per iteration)
339
414
  parts.push('=== CURRENT CONTEXT ===\n');
340
415
  // Display note from previous iteration (high priority tactical info)
341
416
  if (context.noteFromPreviousIteration) {
342
417
  const note = context.noteFromPreviousIteration;
343
- parts.push(`📝 YOUR NOTE FROM ITERATION ${note.fromIteration}:`);
418
+ parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
344
419
  parts.push(` ${note.content}`);
345
420
  parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
421
+ parts.push(``);
422
+ parts.push(` ACTION REQUIRED:`);
423
+ parts.push(` 1. Did your previous action work? Check the screenshot!`);
424
+ parts.push(` 2. If it WORKED: Execute next step from your plan`);
425
+ parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
426
+ parts.push(` 4. Write NEW noteToFutureSelf with:`);
427
+ parts.push(` - What worked/didn't work (learn from attempts)`);
428
+ parts.push(` - Updated strategy with new backup plan`);
429
+ parts.push(` - Next alternatives to try if this fails`);
430
+ parts.push(` - Build on previous note's reasoning`);
431
+ parts.push(``);
432
+ parts.push(` DON'T repeat failed actions - try your backup plan!`);
346
433
  parts.push('');
347
434
  }
348
- // Check for screenshot loops (analysis paralysis)
435
+ // Check for screenshot loops (analysis paralysis) - PER STEP tracking
436
+ const screenshotsThisStep = context.recentSteps.filter(s => s.stepNumber === context.stepNumber &&
437
+ (s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')));
349
438
  const recentScreenshots = context.recentSteps.slice(-3).filter(s => s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'));
350
- if (recentScreenshots.length >= 2) {
351
- parts.push(`🚨🚨🚨 SCREENSHOT LOOP DETECTED 🚨🚨🚨`);
352
- parts.push(`You've taken ${recentScreenshots.length} screenshots in last 3 iterations!`);
353
- parts.push(`STOP analyzing - START ACTING!`);
354
- parts.push(`Use ANY selector from DOM snapshot and try clicking.`);
355
- parts.push(`If command succeeds and new elements appear mark "complete"`);
356
- parts.push(`🚨🚨🚨\n`);
439
+ if (screenshotsThisStep.length >= 3) {
440
+ parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
441
+ parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
442
+ parts.push(`NO MORE SCREENSHOTS ALLOWED - YOU MUST ACT NOW!`);
443
+ parts.push(`Pick ANY selector from your prior DOM snapshots and TRY IT.`);
444
+ parts.push(`Even if uncertain, execute the command. Failure is better than analysis paralysis.`);
445
+ parts.push(`If toolCalls contains "take_screenshot" → SYSTEM WILL REJECT IT\n`);
446
+ }
447
+ else if (recentScreenshots.length >= 2) {
448
+ parts.push(`[WARNING] ${recentScreenshots.length} screenshots in last 3 iterations - avoid more screenshots`);
449
+ parts.push(`Use selector recommendations from prior screenshots\n`);
357
450
  }
358
451
  // System warnings for accumulated failures
359
452
  if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
360
- parts.push(`⚠️ SYSTEM WARNING: ${consecutiveFailures} failures!`);
361
- parts.push(`Take screenshot if needed. Try different selector strategy.`);
453
+ parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
454
+ // Only suggest a screenshot if none has been taken yet THIS STEP
455
+ if (screenshotsThisStep.length === 0) {
456
+ parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
457
+ }
458
+ else {
459
+ parts.push(`You already have visual context. Try different selector NOW.`);
460
+ }
362
461
  parts.push(`Question assumptions: Am I at the right step?`);
363
- parts.push(`⚠️\n`);
462
+ parts.push(`[WARNING]\n`);
364
463
  }
365
464
  else if (consecutiveFailures && consecutiveFailures >= 4) {
366
- parts.push(`⚠️ CRITICAL: ${consecutiveFailures} failures!`);
367
- parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
465
+ parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
466
+ if (enableCoordinateMode) {
467
+ parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
468
+ }
469
+ else {
470
+ parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
471
+ }
368
472
  }
369
- // Trigger coordinate mode if many failures (Phase 1: after 3 failures)
370
- if (consecutiveFailures && consecutiveFailures >= 3) {
473
+ // Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
474
+ if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
371
475
  parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
372
476
  parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
373
477
  parts.push(`You MUST use coordinate-based action now (percentages).`);
@@ -379,6 +483,8 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
379
483
  parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
380
484
  parts.push(`${context.currentStepGoal}`);
381
485
  parts.push(``);
486
+ parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
487
+ parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
382
488
  parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
383
489
  if (context.completedSteps.length > 0) {
384
490
  parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
@@ -386,14 +492,42 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
386
492
  if (context.remainingSteps.length > 0) {
387
493
  parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
388
494
  }
495
+ // SoM screenshot (if available)
496
+ if (context.somScreenshot) {
497
+ parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
498
+ parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
499
+ parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
500
+ parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
501
+ parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
502
+ parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
503
+ parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
504
+ parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
505
+ parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
506
+ parts.push(``);
507
+ // SoM element map for disambiguation
508
+ if (context.somElementMap) {
509
+ parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
510
+ parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
511
+ parts.push(context.somElementMap);
512
+ parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
513
+ parts.push(``);
514
+ }
515
+ }
389
516
  // Current page state (most variable content - at the end)
390
517
  parts.push(`\nCURRENT PAGE:`);
391
518
  parts.push(`URL: ${context.currentURL}`);
392
519
  parts.push(`Title: ${context.currentPageInfo.title}`);
393
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
394
- parts.push(context.currentPageInfo.formattedElements);
395
- parts.push(`\nARIA TREE (hierarchical structure):`);
396
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
520
+ // Only include DOM details if NOT in SoM mode
521
+ if (!context.somScreenshot) {
522
+ parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
523
+ parts.push(context.currentPageInfo.formattedElements);
524
+ parts.push(`\nARIA TREE (hierarchical structure):`);
525
+ parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
526
+ }
527
+ else {
528
+ // In SoM mode, skip DOM details - agent uses visual screenshot instead
529
+ parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
530
+ }
397
531
  if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
398
532
  parts.push('... (truncated)');
399
533
  }
@@ -402,11 +536,11 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
402
536
  if (context.recentSteps.length > 0) {
403
537
  parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
404
538
  for (const step of context.recentSteps) {
405
- const status = step.result === 'success' ? '' : '';
539
+ const status = step.result === 'success' ? '[OK]' : '[FAIL]';
406
540
  parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
407
541
  parts.push(` Code: ${step.code}`);
408
542
  if (step.result === 'failure' && step.error) {
409
- parts.push(` ERROR: ${step.error}`);
543
+ parts.push(` ERROR: ${step.error}`);
410
544
  parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
411
545
  }
412
546
  else {
@@ -419,7 +553,7 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
419
553
  if (recentFailures.length >= 2) {
420
554
  const sameSelector = recentFailures.slice(-2).every((s, i, arr) => i === 0 || s.code === arr[i - 1].code);
421
555
  if (sameSelector) {
422
- parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
556
+ parts.push(`[WARNING] WARNING: You've tried the same selector multiple times and it failed!`);
423
557
  parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
424
558
  parts.push(` YOU MUST try a completely different selector this time!\n`);
425
559
  }
@@ -439,15 +573,163 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
439
573
  parts.push(JSON.stringify(context.extractedData, null, 2));
440
574
  parts.push('');
441
575
  }
442
- // Previous iteration guidance
443
- if (context.previousIterationGuidance) {
444
- parts.push(`\nGUIDANCE FROM PREVIOUS ITERATION:`);
445
- parts.push(context.previousIterationGuidance.guidanceForNext);
446
- if (context.previousIterationGuidance.detectingLoop) {
447
- parts.push(`⚠️ LOOP DETECTED: ${context.previousIterationGuidance.loopReasoning}`);
576
+ return parts.join('\n');
577
+ }
578
+ /**
579
+ * Build exploratory system prompt for autonomous exploration
580
+ */
581
+ static buildExploratorySystemPrompt(toolDescriptions) {
582
+ return `You are an autonomous exploration agent that discovers and tests web application features.
583
+
584
+ ${toolDescriptions}
585
+
586
+ YOUR RESPONSE FORMAT - Output JSON matching this interface:
587
+
588
+ interface AgentDecisionLLMResponse {
589
+ status: string; // "continue" | "complete" | "stuck"
590
+ reasoning: string; // What you're exploring and why
591
+
592
+ // COMMANDS: Array of plain Playwright command strings
593
+ commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
594
+ commandReasoning?: string;
595
+ toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
596
+ name: string;
597
+ params: Record<string, any>;
598
+ }>;
599
+ toolReasoning?: string;
600
+ needsToolResults?: boolean;
601
+ noteToFutureSelf?: string;
602
+ coordinateAction?: { ... };
603
+ experiences?: string[]; // Use for BOTH app patterns AND exploration progress
604
+ blockerDetected?: { ... };
605
+ debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
606
+ suggestedPromptUpdates?: string;
607
+ reasoning?: string;
608
+ };
609
+ }
610
+
611
+ EXPLORATION MODE GUIDELINES:
612
+
613
+ 1. **JOURNEY-FOCUSED EXPLORATION**: Follow the exploration prompt as your goal for THIS journey
614
+ - Example prompt: "Explore Dashboard and test all widgets"
615
+ - You should systematically test dashboard widgets, not wander off to other sections
616
+ - Stay focused on the given journey goal
617
+
618
+ 2. **ICON BUTTONS**: Match step goal to icon semantics (plus=add, gear=settings, hamburger=menu). Check element map for confirmation. Don't randomly try - reason about fit.
619
+
620
+ 3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).
621
+
622
+ 4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.
623
+
624
+ 5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.
625
+
626
+ 6. **LIMITATIONS**: Cannot complete: sign-up, forgot password, OTP, CAPTCHA, email verification (no inbox/SMS access).
627
+ If encountered: CAPTCHA → stuck, sign-up/OTP → skip and explore other areas.
628
+
629
+ 7. **AUTH**: If credentials provided, login FIRST using exact testDataPrompt values. Don't explore public pages or click sign-up.
630
+
631
+ 8. **BLOCKERS**: Clear cookie modals, tour popups autonomously with blockerDetected.clearingCommands. CAPTCHA → stuck.
632
+
633
+ 9. **STATUS**: complete=goal achieved or budget low, continue=need more, stuck=cannot proceed. Complete when journey goal met, don't wait for maxSteps.
634
+
635
+ 10. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.
636
+
637
+ CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
638
+ YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
639
+ }
640
+ /**
641
+ * Build exploratory user prompt with context
642
+ */
643
+ static buildExploratoryUserPrompt(context, explorationPrompt, testDataPrompt, stepNumber, maxSteps) {
644
+ const parts = [];
645
+ // Add SoM format reminder if screenshot is present
646
+ if (context.somScreenshot) {
647
+ parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
648
+ parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
649
+ parts.push(`Format: { "elementRef": "1", "action": "click" }`);
650
+ parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
651
+ }
652
+ parts.push('=== JOURNEY EXPLORATION CONTEXT ===\n');
653
+ parts.push(`GOAL: ${explorationPrompt}`);
654
+ parts.push(` (Focus on THIS specific goal - don't wander to unrelated areas)\n`);
655
+ if (testDataPrompt) {
656
+ parts.push(`TEST DATA/CREDENTIALS: ${testDataPrompt}`);
657
+ parts.push(` [WARNING] IMPORTANT: If credentials are provided above (email/username and password), you MUST:`);
658
+ parts.push(` - Use them to LOGIN and explore authenticated features`);
659
+ parts.push(` - Fill login forms with the exact credentials provided`);
660
+ parts.push(` - Don't waste time on public/unauthenticated pages when you can login`);
661
+ parts.push(` - Prioritize exploring the authenticated app experience\n`);
662
+ }
663
+ if (stepNumber && maxSteps) {
664
+ parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
665
+ }
666
+ // Show discovered and tracked data from extractedData
667
+ if (context.extractedData && Object.keys(context.extractedData).length > 0) {
668
+ parts.push(`\nDISCOVERED DATA (this journey):`);
669
+ for (const [key, value] of Object.entries(context.extractedData)) {
670
+ parts.push(` ${key}: ${value}`);
448
671
  }
449
- parts.push('');
450
672
  }
673
+ // SoM screenshot (if available)
674
+ if (context.somScreenshot) {
675
+ parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
676
+ parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
677
+ parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
678
+ parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
679
+ parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
680
+ parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
681
+ parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
682
+ parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
683
+ parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
684
+ parts.push(``);
685
+ // SoM element map for disambiguation
686
+ if (context.somElementMap) {
687
+ parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
688
+ parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
689
+ parts.push(context.somElementMap);
690
+ parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
691
+ parts.push(``);
692
+ }
693
+ }
694
+ parts.push(`\nCURRENT PAGE:`);
695
+ parts.push(`URL: ${context.currentURL}`);
696
+ parts.push(`Title: ${context.currentPageInfo.title}`);
697
+ // Only include DOM details if NOT in SoM mode
698
+ if (!context.somScreenshot) {
699
+ parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
700
+ parts.push(context.currentPageInfo.formattedElements);
701
+ parts.push(`\nARIA TREE (hierarchical structure):`);
702
+ parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
703
+ }
704
+ else {
705
+ // In SoM mode, skip DOM details - agent uses visual screenshot
706
+ parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
707
+ }
708
+ if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
709
+ parts.push('... (truncated)');
710
+ }
711
+ // Recent actions
712
+ if (context.recentSteps.length > 0) {
713
+ parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
714
+ for (const step of context.recentSteps) {
715
+ const status = step.result === 'success' ? '[OK]' : '[FAIL]';
716
+ parts.push(` ${status} ${step.action}`);
717
+ parts.push(` ${step.observation}`);
718
+ }
719
+ }
720
+ // Learnings and exploration progress
721
+ if (context.experiences && context.experiences.length > 0) {
722
+ parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
723
+ for (const exp of context.experiences) {
724
+ parts.push(` • ${exp}`);
725
+ }
726
+ }
727
+ // Note from previous iteration
728
+ if (context.noteFromPreviousIteration) {
729
+ parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
730
+ parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
731
+ }
732
+ parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);
451
733
  return parts.join('\n');
452
734
  }
453
735
  }