testchimp-runner-core 0.0.34 → 0.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/execution-service.d.ts +1 -4
- package/dist/execution-service.d.ts.map +1 -1
- package/dist/execution-service.js +155 -468
- package/dist/execution-service.js.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -1
- package/dist/index.js.map +1 -1
- package/dist/orchestrator/decision-parser.d.ts +18 -0
- package/dist/orchestrator/decision-parser.d.ts.map +1 -0
- package/dist/orchestrator/decision-parser.js +127 -0
- package/dist/orchestrator/decision-parser.js.map +1 -0
- package/dist/orchestrator/index.d.ts +4 -2
- package/dist/orchestrator/index.d.ts.map +1 -1
- package/dist/orchestrator/index.js +14 -2
- package/dist/orchestrator/index.js.map +1 -1
- package/dist/orchestrator/orchestrator-agent.d.ts +17 -14
- package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
- package/dist/orchestrator/orchestrator-agent.js +534 -204
- package/dist/orchestrator/orchestrator-agent.js.map +1 -1
- package/dist/orchestrator/orchestrator-prompts.d.ts +14 -2
- package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
- package/dist/orchestrator/orchestrator-prompts.js +529 -247
- package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
- package/dist/orchestrator/page-som-handler.d.ts +106 -0
- package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
- package/dist/orchestrator/page-som-handler.js +1353 -0
- package/dist/orchestrator/page-som-handler.js.map +1 -0
- package/dist/orchestrator/som-types.d.ts +149 -0
- package/dist/orchestrator/som-types.d.ts.map +1 -0
- package/dist/orchestrator/som-types.js +87 -0
- package/dist/orchestrator/som-types.js.map +1 -0
- package/dist/orchestrator/tool-registry.d.ts +2 -0
- package/dist/orchestrator/tool-registry.d.ts.map +1 -1
- package/dist/orchestrator/tool-registry.js.map +1 -1
- package/dist/orchestrator/tools/index.d.ts +4 -1
- package/dist/orchestrator/tools/index.d.ts.map +1 -1
- package/dist/orchestrator/tools/index.js +7 -2
- package/dist/orchestrator/tools/index.js.map +1 -1
- package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
- package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
- package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
- package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
- package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
- package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
- package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
- package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
- package/dist/orchestrator/types.d.ts +23 -1
- package/dist/orchestrator/types.d.ts.map +1 -1
- package/dist/orchestrator/types.js +11 -1
- package/dist/orchestrator/types.js.map +1 -1
- package/dist/scenario-service.d.ts +5 -0
- package/dist/scenario-service.d.ts.map +1 -1
- package/dist/scenario-service.js +17 -0
- package/dist/scenario-service.js.map +1 -1
- package/dist/scenario-worker-class.d.ts +4 -0
- package/dist/scenario-worker-class.d.ts.map +1 -1
- package/dist/scenario-worker-class.js +18 -3
- package/dist/scenario-worker-class.js.map +1 -1
- package/dist/testing/agent-tester.d.ts +35 -0
- package/dist/testing/agent-tester.d.ts.map +1 -0
- package/dist/testing/agent-tester.js +84 -0
- package/dist/testing/agent-tester.js.map +1 -0
- package/dist/testing/ref-translator-tester.d.ts +44 -0
- package/dist/testing/ref-translator-tester.d.ts.map +1 -0
- package/dist/testing/ref-translator-tester.js +104 -0
- package/dist/testing/ref-translator-tester.js.map +1 -0
- package/dist/utils/hierarchical-selector.d.ts +47 -0
- package/dist/utils/hierarchical-selector.d.ts.map +1 -0
- package/dist/utils/hierarchical-selector.js +212 -0
- package/dist/utils/hierarchical-selector.js.map +1 -0
- package/dist/utils/page-info-retry.d.ts +14 -0
- package/dist/utils/page-info-retry.d.ts.map +1 -0
- package/dist/utils/page-info-retry.js +60 -0
- package/dist/utils/page-info-retry.js.map +1 -0
- package/dist/utils/page-info-utils.d.ts +1 -0
- package/dist/utils/page-info-utils.d.ts.map +1 -1
- package/dist/utils/page-info-utils.js +46 -18
- package/dist/utils/page-info-utils.js.map +1 -1
- package/dist/utils/ref-attacher.d.ts +21 -0
- package/dist/utils/ref-attacher.d.ts.map +1 -0
- package/dist/utils/ref-attacher.js +149 -0
- package/dist/utils/ref-attacher.js.map +1 -0
- package/dist/utils/ref-translator.d.ts +49 -0
- package/dist/utils/ref-translator.d.ts.map +1 -0
- package/dist/utils/ref-translator.js +276 -0
- package/dist/utils/ref-translator.js.map +1 -0
- package/package.json +1 -1
- package/plandocs/exploratory-mode-support-v2.plan.md +953 -0
- package/plandocs/exploratory-mode-support.plan.md +928 -0
- package/plandocs/journey-id-tracking-addendum.md +227 -0
- package/src/execution-service.ts +179 -596
- package/src/index.ts +10 -0
- package/src/orchestrator/decision-parser.ts +139 -0
- package/src/orchestrator/index.ts +25 -1
- package/src/orchestrator/orchestrator-agent.ts +656 -236
- package/src/orchestrator/orchestrator-prompts.ts +559 -247
- package/src/orchestrator/page-som-handler.ts +1565 -0
- package/src/orchestrator/som-types.ts +188 -0
- package/src/orchestrator/tool-registry.ts +2 -0
- package/src/orchestrator/tools/index.ts +4 -1
- package/src/orchestrator/tools/refresh-som-markers.ts +69 -0
- package/src/orchestrator/tools/view-previous-screenshot.ts +103 -0
- package/src/orchestrator/types.ts +49 -6
- package/src/scenario-service.ts +20 -0
- package/src/scenario-worker-class.ts +24 -3
- package/src/utils/page-info-retry.ts +65 -0
- package/src/utils/page-info-utils.ts +53 -18
- package/testchimp-runner-core-0.0.35.tgz +0 -0
- package/src/orchestrator/orchestrator-agent.ts.backup +0 -1386
- package/testchimp-runner-core-0.0.33.tgz +0 -0
- /package/{RELEASE_0.0.26.md → releasenotes/RELEASE_0.0.26.md} +0 -0
- /package/{RELEASE_0.0.27.md → releasenotes/RELEASE_0.0.27.md} +0 -0
- /package/{RELEASE_0.0.28.md → releasenotes/RELEASE_0.0.28.md} +0 -0
|
@@ -9,223 +9,259 @@ class OrchestratorPrompts {
|
|
|
9
9
|
/**
|
|
10
10
|
* Build main system prompt for selector-based mode
|
|
11
11
|
*/
|
|
12
|
-
static buildSystemPrompt(toolDescriptions) {
|
|
12
|
+
static buildSystemPrompt(toolDescriptions, enableCoordinateMode = false) {
|
|
13
13
|
return `You are an intelligent test automation agent that executes web scenarios using Playwright.
|
|
14
14
|
|
|
15
|
+
DISCRETE EXPERIENCE LOOP:
|
|
16
|
+
You operate in iterations: receive state → decide → sleep → wake with new state.
|
|
17
|
+
|
|
18
|
+
Key implications:
|
|
19
|
+
- System waits for page stability after each batch
|
|
20
|
+
- Effects may be transient (alerts) or persistent (error labels) - suggest checking persistent indicators
|
|
21
|
+
- Batch safe commands (fill all fields together), separate DOM-changing ones (click then fill)
|
|
22
|
+
- Note to future self: strategy, what to verify, backup plans if action fails
|
|
23
|
+
|
|
24
|
+
EXECUTION PRIORITIES:
|
|
25
|
+
1. SoM-marked elements (reliable selectors)
|
|
26
|
+
2. Keyboard nav from marked elements (Tab, Enter)
|
|
27
|
+
3. Coords for unmarked elements (valid fallback)
|
|
28
|
+
|
|
29
|
+
Difficult: Shadow DOM, iframes, canvas UIs, file uploads - try keyboard or coords.
|
|
30
|
+
|
|
15
31
|
${toolDescriptions}
|
|
16
32
|
|
|
17
|
-
|
|
33
|
+
CRITICAL: STEP COMPLETION DECISION
|
|
34
|
+
Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
|
|
35
|
+
|
|
36
|
+
**When to mark COMPLETE:**
|
|
37
|
+
- Step: "Navigate to URL" → Mark complete after navigate command succeeds (don't login yet!)
|
|
38
|
+
- Step: "Fill login form" → Mark complete after filling fields (don't click submit yet!)
|
|
39
|
+
- Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
|
|
40
|
+
|
|
41
|
+
**DO NOT:**
|
|
42
|
+
- Continue with future steps while still on current step
|
|
43
|
+
- Assume the step wants you to do more than stated
|
|
44
|
+
- Wait for side effects (navigation, etc.) before marking complete
|
|
45
|
+
|
|
46
|
+
**The goal text is LITERAL** - do exactly what it says, then mark complete.
|
|
47
|
+
|
|
48
|
+
OUTPUT FORMAT (JSON):
|
|
49
|
+
|
|
50
|
+
{
|
|
51
|
+
"status": "continue" | "complete" | "stuck" | "infeasible",
|
|
52
|
+
"reasoning": "your thinking",
|
|
53
|
+
|
|
54
|
+
"commands": [ // Mix ref and playwright commands as needed
|
|
55
|
+
{ "type": "playwright", "code": "await page.goto('https://example.com')" },
|
|
56
|
+
{ "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
|
|
57
|
+
{ "type": "ref", "ref": "e31", "operation": "click" },
|
|
58
|
+
{ "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
|
|
59
|
+
],
|
|
60
|
+
|
|
61
|
+
"toolCalls": [{ "name": "tool_name", "params": {} }],
|
|
62
|
+
"blockerDetected": { "description": "...", "clearingCommands": ["..."] },
|
|
63
|
+
"experiences": ["app pattern"],
|
|
64
|
+
"noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
|
|
65
|
+
"debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
|
|
66
|
+
"suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
|
|
67
|
+
"reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
18
70
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
explanation: string;
|
|
55
|
-
};
|
|
71
|
+
NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
|
|
72
|
+
Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
|
|
73
|
+
Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
|
|
74
|
+
|
|
75
|
+
META-LEARNING (debugInfo): Could this prompt have been better? If so, suggest fixes.
|
|
76
|
+
|
|
77
|
+
STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
|
|
78
|
+
RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
|
|
79
|
+
|
|
80
|
+
COMMANDS FORMAT:
|
|
81
|
+
|
|
82
|
+
Array of plain Playwright command strings:
|
|
83
|
+
{
|
|
84
|
+
"commands": [
|
|
85
|
+
"await page.fill('input[name=\"email\"]', 'user@test.com')",
|
|
86
|
+
"await page.fill('input[name=\"password\"]', 'secret123')",
|
|
87
|
+
"await page.click('button[type=\"submit\"]')"
|
|
88
|
+
]
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
SELECTOR STRATEGIES (use in order of preference):
|
|
92
|
+
1. getByRole: page.getByRole('button', {name: 'Login'})
|
|
93
|
+
2. getByLabel: page.getByLabel('Email address')
|
|
94
|
+
3. getByPlaceholder: page.getByPlaceholder('Enter email')
|
|
95
|
+
4. getByText: page.getByText('Sign in')
|
|
96
|
+
5. CSS: page.locator('input[name="email"]')
|
|
97
|
+
6. Test IDs: page.getByTestId('login-button')
|
|
98
|
+
|
|
99
|
+
Example login commands:
|
|
100
|
+
{
|
|
101
|
+
"commands": [
|
|
102
|
+
"await page.getByLabel('Email').fill('user@test.com')",
|
|
103
|
+
"await page.getByLabel('Password').fill('secret123')",
|
|
104
|
+
"await page.getByRole('button', {name: 'Submit'}).click()"
|
|
105
|
+
]
|
|
56
106
|
}
|
|
57
107
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
|
|
61
|
-
- Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
|
|
62
|
-
- EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
|
|
63
|
-
- System will OVERRIDE and force "continue" if you violate this
|
|
64
|
-
|
|
65
|
-
RULE #2: Decision tree:
|
|
66
|
-
- Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
|
|
67
|
-
- Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
|
|
68
|
-
|
|
69
|
-
Status meanings:
|
|
70
|
-
- "complete": Commands succeeded AND goal achieved
|
|
71
|
-
- "continue": Command failed OR need more actions
|
|
72
|
-
- "stuck": Tried 5 iterations, all failed, can't proceed
|
|
73
|
-
- "infeasible": Goal impossible (element truly doesn't exist)
|
|
74
|
-
|
|
75
|
-
Examples:
|
|
76
|
-
✅ Command: page.click('button') → Success → Goal done → status: "complete"
|
|
77
|
-
❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
|
|
78
|
-
✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
|
|
79
|
-
|
|
80
|
-
STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
|
|
81
|
-
|
|
82
|
-
After repeated failures, check:
|
|
83
|
-
1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
|
|
84
|
-
2. LOOK FORWARD: Is current step already done?
|
|
85
|
-
3. LOOK AHEAD: Is next step more feasible with current page state?
|
|
86
|
-
|
|
87
|
-
Stick to original plan unless clear evidence suggests otherwise.
|
|
88
|
-
|
|
89
|
-
BLOCKER DETECTION:
|
|
90
|
-
|
|
91
|
-
Use when unexpected UI blocks current goal (modals, overlays, prompts).
|
|
92
|
-
Provide clearingCommands to dismiss blocker, then regular commands execute.
|
|
93
|
-
|
|
94
|
-
Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
|
|
95
|
-
NOT blockers: Wrong selectors, missing elements (those are "infeasible")
|
|
96
|
-
|
|
97
|
-
EXPERIENCES - App-specific patterns only:
|
|
98
|
-
- Concise, 1-2 per iteration
|
|
99
|
-
- Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
|
|
100
|
-
- NOT obvious things ("button has role=button") or individual selectors
|
|
101
|
-
- Combine related learnings
|
|
102
|
-
|
|
103
|
-
CRITICAL RULES FOR DECISION MAKING:
|
|
104
|
-
|
|
105
|
-
DECISION-MAKING PROCESS:
|
|
106
|
-
|
|
107
|
-
1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
|
|
108
|
-
|
|
109
|
-
2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
|
|
110
|
-
|
|
111
|
-
3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
|
|
112
|
-
Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
|
|
113
|
-
|
|
114
|
-
4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
|
|
115
|
-
|
|
116
|
-
5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
|
|
117
|
-
|
|
118
|
-
6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
|
|
119
|
-
- You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
|
|
120
|
-
- ONLY use what's in the list - DON'T invent text/names not shown
|
|
121
|
-
- Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
|
|
122
|
-
- ⚠️ If text appears multiple times → scope to parent or use role filter
|
|
123
|
-
- Element not in list? → take_screenshot
|
|
124
|
-
- ARIA tree = source of truth for roles/names
|
|
125
|
-
|
|
126
|
-
⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
|
|
127
|
-
- Tooltips (role="tooltip") - informational only, NOT clickable
|
|
128
|
-
- Status messages (role="status", role="alert") - display info, NOT clickable
|
|
129
|
-
- Popovers with no buttons inside - usually just show info on hover
|
|
130
|
-
- Hidden elements (aria-hidden="true", display:none) - can't interact
|
|
131
|
-
|
|
132
|
-
If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
|
|
133
|
-
|
|
134
|
-
⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
|
|
135
|
-
- Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
|
|
136
|
-
- BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
|
|
137
|
-
- GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
|
|
138
|
-
- GOOD: locator('button').getByText('Settings') → scoped to button element
|
|
139
|
-
- Always prefer role-based selectors when text might appear in tooltips
|
|
140
|
-
|
|
141
|
-
7. **SELECTOR PREFERENCE** (CRITICAL):
|
|
142
|
-
Prefer in order:
|
|
143
|
-
1. getByRole/Label/Placeholder - Semantic, stable
|
|
144
|
-
2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
|
|
145
|
-
3. data-testid or stable IDs
|
|
146
|
-
4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
|
|
147
|
-
|
|
148
|
-
⚠️ Common mistakes:
|
|
149
|
-
- getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
|
|
150
|
-
Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
|
|
151
|
-
- Missing timeout on goto() → Add { timeout: 30000 }
|
|
152
|
-
- Using auto-generated IDs → Break when components re-render
|
|
153
|
-
|
|
154
|
-
8. **ASSERTIONS** (CRITICAL):
|
|
155
|
-
Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
|
|
156
|
-
|
|
157
|
-
When to use:
|
|
158
|
-
- "Verify X appears" → await expect(locator).toBeVisible()
|
|
159
|
-
- "Check field is empty" → await expect(locator).toBeEmpty()
|
|
160
|
-
- "Confirm URL" → await expect(page).toHaveURL('...')
|
|
161
|
-
|
|
162
|
-
When NOT to use:
|
|
163
|
-
- "Send message" → DON'T verify it appeared (unless step asks)
|
|
164
|
-
- "Click Submit" → DON'T check button state
|
|
165
|
-
- "Fill field" → DON'T verify it's filled
|
|
166
|
-
|
|
167
|
-
9. **TOOLS vs COMMANDS**:
|
|
168
|
-
Tools = read-only info gathering (screenshot, recall_history, extract_data)
|
|
169
|
-
Commands = state changes (Playwright: goto, click, fill, etc.)
|
|
170
|
-
|
|
171
|
-
Navigation commands MUST include timeout:
|
|
172
|
-
- page.goto(url, { waitUntil: 'load', timeout: 30000 })
|
|
173
|
-
- page.waitForLoadState('load', { timeout: 30000 })
|
|
174
|
-
|
|
175
|
-
10. **ERROR ANALYSIS** (Think about what went wrong):
|
|
176
|
-
- "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
|
|
177
|
-
- "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
|
|
178
|
-
- "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
|
|
179
|
-
- "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
|
|
180
|
-
- Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
|
|
181
|
-
|
|
182
|
-
11. **WHEN TO RUN COMMANDS vs TOOLS**:
|
|
183
|
-
- Confident about selectors from DOM → Run commands directly
|
|
184
|
-
- Unsure or failed 2+ times → Take screenshot first
|
|
185
|
-
- First iteration of a step → Usually can run commands from DOM
|
|
186
|
-
- After successful command → mark "complete" if goal achieved (trust Playwright - if it succeeded, it worked)
|
|
187
|
-
|
|
188
|
-
12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
|
|
189
|
-
|
|
190
|
-
Write FREE-FORM notes for your next iteration about:
|
|
191
|
-
- What you tried and why it failed
|
|
192
|
-
- Hypothesis being tested
|
|
193
|
-
- Plan for next attempt
|
|
194
|
-
- Page behavior patterns observed
|
|
195
|
-
|
|
196
|
-
Your next iteration reads this FIRST - use it to maintain strategic continuity.
|
|
197
|
-
|
|
198
|
-
13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
|
|
199
|
-
|
|
200
|
-
Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
|
|
201
|
-
- xPercent: 0=left, 100=right
|
|
202
|
-
- yPercent: 0=top, 100=bottom
|
|
203
|
-
|
|
204
|
-
Format:
|
|
205
|
-
{
|
|
206
|
-
"coordinateAction": {
|
|
207
|
-
"type": "coordinate",
|
|
208
|
-
"action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
|
|
209
|
-
"xPercent": 15.755, "yPercent": 8.500,
|
|
210
|
-
"toXPercent": 45.25, "toYPercent": 8.50, // For drag
|
|
211
|
-
"value": "text", // For fill
|
|
212
|
-
"scrollAmount": 500 // For scroll
|
|
213
|
-
}
|
|
108
|
+
INTERACTIVE ELEMENTS section shows available selectors for each element.`;
|
|
214
109
|
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
110
|
+
/**
|
|
111
|
+
* Build SoM (Set-of-Marks) system prompt for visual element identification
|
|
112
|
+
*/
|
|
113
|
+
static buildSomSystemPrompt(restrictCoordinates = false) {
|
|
114
|
+
const coordinateRestriction = restrictCoordinates ? `
|
|
115
|
+
|
|
116
|
+
CRITICAL: COORDINATE COMMANDS RESTRICTED
|
|
117
|
+
You are in SCRIPT GENERATION mode. Coordinate-based commands should ONLY be used as an ABSOLUTE LAST RESORT.
|
|
118
|
+
|
|
119
|
+
Strong preference order:
|
|
120
|
+
1. Use SoM-marked elements with actions (fill, click, press Enter)
|
|
121
|
+
2. Use keyboard navigation from SoM-marked elements (Tab, Arrow keys, Enter to submit)
|
|
122
|
+
3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
|
|
123
|
+
|
|
124
|
+
If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
|
|
125
|
+
return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
|
|
126
|
+
|
|
127
|
+
DISCRETE EXPERIENCE LOOP:
|
|
128
|
+
You operate in iterations: receive state → decide → sleep → wake with new state.
|
|
129
|
+
System waits for page stability after each batch - you ALWAYS receive fully loaded pages (never loading screens).
|
|
130
|
+
Batch safe commands, suggest persistent indicators, tell future self what to verify.
|
|
131
|
+
|
|
132
|
+
IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
|
|
133
|
+
|
|
134
|
+
SCREENSHOT SCOPE:
|
|
135
|
+
- Shows VIEWPORT ONLY (what's currently visible, not full page)
|
|
136
|
+
- Elements outside the viewport are NOT shown (you must scroll to reveal them)
|
|
137
|
+
- If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
|
|
138
|
+
|
|
139
|
+
VISUAL MARKER SYSTEM:
|
|
140
|
+
- Each interactive element has a colored bounding box with a unique color
|
|
141
|
+
- The element ID (1, 2, 3, etc.) is displayed in a label at the TOP-RIGHT corner, OUTSIDE the box
|
|
142
|
+
- The label is typically positioned OUTSIDE and ABOVE the bounding box (not attached)
|
|
143
|
+
- The label color matches the bounding box color for easy correlation
|
|
144
|
+
- TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color
|
|
145
|
+
|
|
146
|
+
REFERENCE ELEMENTS BY ID:
|
|
147
|
+
- To interact with an element, reference its ID in your commands
|
|
148
|
+
|
|
149
|
+
ICON BUTTON IDENTIFICATION:
|
|
150
|
+
When step involves icon buttons (no visible text), use COMMON ICON SEMANTICS + element map:
|
|
151
|
+
- Match step goal to icon meaning: "Add Campaign" → plus icon, "Settings" → gear, "Menu" → hamburger, "Delete" → trash
|
|
152
|
+
- Check element map for aria-label confirmation: [5] might show (aria: "add-campaign")
|
|
153
|
+
- Common icons: hamburger=menu, gear=settings, plus=add, trash=delete, arrow=back/nav, check=confirm, X=close, magnifier=search, dots=more
|
|
154
|
+
- DON'T randomly try icon buttons - reason about which icon fits the step goal
|
|
155
|
+
|
|
156
|
+
Example: Step "Add new campaign" → Look for plus icon in toolbar → Check map shows aria "add" → Use that ID.
|
|
157
|
+
|
|
158
|
+
CRITICAL: ONLY INTERACT WITH VISIBLE ELEMENTS - use your EYES, not assumptions!
|
|
159
|
+
FORBIDDEN: Guessing locations, assuming "typical" positions, clicking without seeing element.
|
|
160
|
+
REQUIRED: Only interact with elements you SEE in screenshot. If not visible, scroll or use take_screenshot(isFullPage=true).
|
|
161
|
+
If action fails, try alternative elements - don't repeat same ID blindly.
|
|
162
|
+
|
|
163
|
+
TYPESCRIPT INTERFACES (your response MUST conform to these):
|
|
164
|
+
|
|
165
|
+
\`\`\`typescript
|
|
166
|
+
interface Coordinate {
|
|
167
|
+
x: number; // Percentage of viewport width (0-100, use 3 decimals: 15.625)
|
|
168
|
+
y: number; // Percentage of viewport height (0-100, use 3 decimals: 82.375)
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
interface SomCommand {
|
|
172
|
+
action: InteractionAction; // REQUIRED: Action to perform (distinguishes from SomVerification)
|
|
173
|
+
elementRef?: string; // Element ID from screenshot (e.g., "1", "2", "42")
|
|
174
|
+
coord?: Coordinate; // Direct percentage-based coords (use when SoM marker missing)
|
|
175
|
+
value?: string; // For fill/select/press actions
|
|
176
|
+
fromCoord?: Coordinate; // For drag actions (start point)
|
|
177
|
+
toCoord?: Coordinate; // For drag actions (end point)
|
|
178
|
+
// ... other optional parameters
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
interface SomVerification {
|
|
182
|
+
verificationType: VerificationType; // REQUIRED: Type of verification (distinguishes from SomCommand)
|
|
183
|
+
elementRef?: string; // SoM ID (e.g., "3") - optional for count verifications
|
|
184
|
+
expected?: string | number; // Expected value/text/count
|
|
185
|
+
description?: string; // Human-readable description
|
|
186
|
+
selector?: string; // CSS selector for count verifications (e.g., 'ul.items > li')
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
// See available actions and verifications in the comments below
|
|
190
|
+
|
|
191
|
+
COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationType').
|
|
192
|
+
Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
|
|
193
|
+
CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
|
|
194
|
+
|
|
195
|
+
COORDINATES (when SoM marker missing):
|
|
196
|
+
Use percentage-based coords for unmarked elements:
|
|
197
|
+
{ "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
|
|
198
|
+
|
|
199
|
+
Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
|
|
200
|
+
After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
|
|
201
|
+
|
|
202
|
+
NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
|
|
203
|
+
Example: { "action": "navigate", "value": "https://..." }
|
|
204
|
+
DON'T click address bar - use navigate action. System waits for page load after navigation.
|
|
205
|
+
|
|
206
|
+
// Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
|
|
207
|
+
// Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
|
|
208
|
+
|
|
209
|
+
interface AgentDecisionLLMResponse {
|
|
210
|
+
status: "continue" | "complete" | "stuck" | "infeasible";
|
|
211
|
+
reasoning: string;
|
|
212
|
+
commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
|
|
213
|
+
commandReasoning?: string;
|
|
214
|
+
toolCalls?: Array<{ name: string; params: Record<string, any> }>;
|
|
215
|
+
noteToFutureSelf?: string;
|
|
216
|
+
experiences?: string[];
|
|
217
|
+
blockerDetected?: { description: string; clearingCommands: SomCommand[] };
|
|
218
|
+
debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
|
|
219
|
+
}
|
|
220
|
+
\`\`\`
|
|
221
|
+
|
|
222
|
+
NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
|
|
223
|
+
Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
|
|
224
|
+
|
|
225
|
+
EXAMPLE RESPONSES:
|
|
226
|
+
|
|
227
|
+
Action step:
|
|
228
|
+
\`\`\`json
|
|
229
|
+
{
|
|
230
|
+
"status": "continue",
|
|
231
|
+
"reasoning": "Need to fill login form with credentials",
|
|
232
|
+
"commands": [
|
|
233
|
+
{ "elementRef": "5", "action": "fill", "value": "user@example.com" },
|
|
234
|
+
{ "elementRef": "7", "action": "fill", "value": "password123" },
|
|
235
|
+
{ "elementRef": "12", "action": "click" }
|
|
236
|
+
],
|
|
237
|
+
"commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
|
|
238
|
+
}
|
|
239
|
+
\`\`\`
|
|
240
|
+
|
|
241
|
+
Verification step:
|
|
242
|
+
\`\`\`json
|
|
243
|
+
{
|
|
244
|
+
"status": "complete",
|
|
245
|
+
"reasoning": "Message sent and verified in conversation",
|
|
246
|
+
"commands": [
|
|
247
|
+
{ "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
|
|
248
|
+
{ "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
|
|
249
|
+
],
|
|
250
|
+
"commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
|
|
251
|
+
}
|
|
252
|
+
\`\`\`
|
|
253
|
+
|
|
254
|
+
REPAIR MODE - Step already completed (DELETE case):
|
|
255
|
+
\`\`\`json
|
|
256
|
+
{
|
|
257
|
+
"status": "complete",
|
|
258
|
+
"reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
|
|
259
|
+
"commands": [],
|
|
260
|
+
"commandReasoning": "No commands needed - step goal already achieved/obsolete"
|
|
261
|
+
}
|
|
262
|
+
\`\`\`
|
|
263
|
+
|
|
264
|
+
OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
|
|
229
265
|
}
|
|
230
266
|
/**
|
|
231
267
|
* Build coordinate-specific system prompt (used when selectors repeatedly fail)
|
|
@@ -236,17 +272,20 @@ DECISION-MAKING PROCESS:
|
|
|
236
272
|
YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
|
|
237
273
|
|
|
238
274
|
SCREENSHOT PROVIDED:
|
|
239
|
-
You will see a screenshot with
|
|
275
|
+
You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
|
|
240
276
|
|
|
241
277
|
CRITICAL - IDENTIFY THE CORRECT ELEMENT:
|
|
242
278
|
1. READ the step goal carefully - what specific element are you looking for?
|
|
243
|
-
2.
|
|
244
|
-
3.
|
|
279
|
+
2. Look for the colored bounding box that matches the element description
|
|
280
|
+
3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
|
|
281
|
+
4. Match the label color to the bounding box color
|
|
282
|
+
5. LOCATE that element in the screenshot (NOT a similar-looking element!)
|
|
283
|
+
6. VERIFY position using screen regions:
|
|
245
284
|
- Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
|
|
246
285
|
- Center content: xPercent ~30-70%
|
|
247
286
|
- Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
|
|
248
|
-
|
|
249
|
-
|
|
287
|
+
7. CALCULATE percentages from element's CENTER position
|
|
288
|
+
8. SANITY CHECK your percentages:
|
|
250
289
|
- Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
|
|
251
290
|
- Button in top-left at 90%? WRONG - that's top-right!
|
|
252
291
|
- Element description says "left" but x > 50%? WRONG - recheck!
|
|
@@ -258,7 +297,7 @@ Goal: "Click Settings link in left navigation"
|
|
|
258
297
|
→ Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
|
|
259
298
|
→ Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
|
|
260
299
|
→ xPercent: 12.500, yPercent: 32.000
|
|
261
|
-
→ Sanity check: 12.5% is FAR LEFT
|
|
300
|
+
→ Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
|
|
262
301
|
→ Description: "Clicking center of Settings link in left sidebar"
|
|
263
302
|
|
|
264
303
|
CRITICAL VISUAL ESTIMATION TIPS:
|
|
@@ -326,48 +365,113 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
326
365
|
/**
|
|
327
366
|
* Build user prompt with context
|
|
328
367
|
*/
|
|
329
|
-
static buildUserPrompt(context, consecutiveFailures) {
|
|
368
|
+
static buildUserPrompt(context, consecutiveFailures, enableCoordinateMode = false) {
|
|
330
369
|
const parts = [];
|
|
370
|
+
// Add SoM format reminder if screenshot is present
|
|
371
|
+
if (context.somScreenshot) {
|
|
372
|
+
parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
|
|
373
|
+
parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
|
|
374
|
+
parts.push(`Format: { "elementRef": "5", "action": "fill", "value": "text" }`);
|
|
375
|
+
parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
|
|
376
|
+
}
|
|
377
|
+
// Add repair mode context if present
|
|
378
|
+
if (context.priorSteps && context.priorSteps.length > 0) {
|
|
379
|
+
parts.push(`=== REPAIR MODE ===`);
|
|
380
|
+
parts.push(`Fixing a FAILED step in existing script. Page persisted from prior steps.\n`);
|
|
381
|
+
parts.push(`COMPLETED STEPS (already executed):`);
|
|
382
|
+
for (let i = 0; i < context.priorSteps.length; i++) {
|
|
383
|
+
parts.push(` ${i + 1}. [OK] ${context.priorSteps[i]}`);
|
|
384
|
+
}
|
|
385
|
+
parts.push(``);
|
|
386
|
+
parts.push(`>>> FAILED STEP ${context.priorSteps.length + 1}: ${context.currentStepGoal}`);
|
|
387
|
+
parts.push(` This step FAILED. Your job: fix it using current UI (SoM markers).\n`);
|
|
388
|
+
if (context.nextSteps && context.nextSteps.length > 0) {
|
|
389
|
+
parts.push(`REMAINING STEPS (auto-executes after you fix current):`);
|
|
390
|
+
for (let i = 0; i < context.nextSteps.length; i++) {
|
|
391
|
+
parts.push(` ${context.priorSteps.length + 2 + i}. ${context.nextSteps[i]}`);
|
|
392
|
+
}
|
|
393
|
+
parts.push(``);
|
|
394
|
+
}
|
|
395
|
+
parts.push(`REPAIR STRATEGY:`);
|
|
396
|
+
parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
|
|
397
|
+
parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
|
|
398
|
+
parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
|
|
399
|
+
parts.push(`- Use SoM markers to identify current elements`);
|
|
400
|
+
parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
|
|
401
|
+
parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
|
|
402
|
+
parts.push(` → Repair mode = single step fix, then hand back control`);
|
|
403
|
+
parts.push(` → Don't continue to next steps - script will auto-execute them`);
|
|
404
|
+
parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
|
|
405
|
+
}
|
|
331
406
|
// Put static instructions first for LLM caching efficiency
|
|
332
407
|
parts.push('STEP EXECUTION RULES:');
|
|
333
408
|
parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
|
|
334
409
|
parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
|
|
335
410
|
parts.push('- Mark "complete" ONLY if commands succeeded');
|
|
336
|
-
parts.push('-
|
|
411
|
+
parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
|
|
337
412
|
parts.push('- Max 5 iterations per step, then forced STUCK\n');
|
|
338
413
|
// Dynamic content follows (changes per iteration)
|
|
339
414
|
parts.push('=== CURRENT CONTEXT ===\n');
|
|
340
415
|
// Display note from previous iteration (high priority tactical info)
|
|
341
416
|
if (context.noteFromPreviousIteration) {
|
|
342
417
|
const note = context.noteFromPreviousIteration;
|
|
343
|
-
parts.push(`📝 YOUR NOTE FROM ITERATION
|
|
418
|
+
parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
|
|
344
419
|
parts.push(` ${note.content}`);
|
|
345
420
|
parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
|
|
421
|
+
parts.push(``);
|
|
422
|
+
parts.push(` ACTION REQUIRED:`);
|
|
423
|
+
parts.push(` 1. Did your previous action work? Check the screenshot!`);
|
|
424
|
+
parts.push(` 2. If it WORKED: Execute next step from your plan`);
|
|
425
|
+
parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
|
|
426
|
+
parts.push(` 4. Write NEW noteToFutureSelf with:`);
|
|
427
|
+
parts.push(` - What worked/didn't work (learn from attempts)`);
|
|
428
|
+
parts.push(` - Updated strategy with new backup plan`);
|
|
429
|
+
parts.push(` - Next alternatives to try if this fails`);
|
|
430
|
+
parts.push(` - Build on previous note's reasoning`);
|
|
431
|
+
parts.push(``);
|
|
432
|
+
parts.push(` DON'T repeat failed actions - try your backup plan!`);
|
|
346
433
|
parts.push('');
|
|
347
434
|
}
|
|
348
|
-
// Check for screenshot loops (analysis paralysis)
|
|
435
|
+
// Check for screenshot loops (analysis paralysis) - PER STEP tracking
|
|
436
|
+
const screenshotsThisStep = context.recentSteps.filter(s => s.stepNumber === context.stepNumber &&
|
|
437
|
+
(s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')));
|
|
349
438
|
const recentScreenshots = context.recentSteps.slice(-3).filter(s => s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'));
|
|
350
|
-
if (
|
|
351
|
-
parts.push(
|
|
352
|
-
parts.push(`You
|
|
353
|
-
parts.push(`
|
|
354
|
-
parts.push(`
|
|
355
|
-
parts.push(`
|
|
356
|
-
parts.push(
|
|
439
|
+
if (screenshotsThisStep.length >= 3) {
|
|
440
|
+
parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
|
|
441
|
+
parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
|
|
442
|
+
parts.push(`NO MORE SCREENSHOTS ALLOWED - YOU MUST ACT NOW!`);
|
|
443
|
+
parts.push(`Pick ANY selector from your prior DOM snapshots and TRY IT.`);
|
|
444
|
+
parts.push(`Even if uncertain, execute the command. Failure is better than analysis paralysis.`);
|
|
445
|
+
parts.push(`If toolCalls contains "take_screenshot" → SYSTEM WILL REJECT IT\n`);
|
|
446
|
+
}
|
|
447
|
+
else if (recentScreenshots.length >= 2) {
|
|
448
|
+
parts.push(`[WARNING] ${recentScreenshots.length} screenshots in last 3 iterations - avoid more screenshots`);
|
|
449
|
+
parts.push(`Use selector recommendations from prior screenshots\n`);
|
|
357
450
|
}
|
|
358
451
|
// System warnings for accumulated failures
|
|
359
452
|
if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
|
|
360
|
-
parts.push(
|
|
361
|
-
|
|
453
|
+
parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
|
|
454
|
+
// Only suggest screenshot if we haven't already taken multiple THIS STEP
|
|
455
|
+
if (screenshotsThisStep.length === 0) {
|
|
456
|
+
parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
|
|
457
|
+
}
|
|
458
|
+
else {
|
|
459
|
+
parts.push(`You already have visual context. Try different selector NOW.`);
|
|
460
|
+
}
|
|
362
461
|
parts.push(`Question assumptions: Am I at the right step?`);
|
|
363
|
-
parts.push(
|
|
462
|
+
parts.push(`[WARNING]\n`);
|
|
364
463
|
}
|
|
365
464
|
else if (consecutiveFailures && consecutiveFailures >= 4) {
|
|
366
|
-
parts.push(
|
|
367
|
-
|
|
465
|
+
parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
|
|
466
|
+
if (enableCoordinateMode) {
|
|
467
|
+
parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
|
|
468
|
+
}
|
|
469
|
+
else {
|
|
470
|
+
parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
|
|
471
|
+
}
|
|
368
472
|
}
|
|
369
|
-
// Trigger coordinate mode if many failures (Phase 1: after 3 failures)
|
|
370
|
-
if (consecutiveFailures && consecutiveFailures >= 3) {
|
|
473
|
+
// Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
|
|
474
|
+
if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
|
|
371
475
|
parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
|
|
372
476
|
parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
|
|
373
477
|
parts.push(`You MUST use coordinate-based action now (percentages).`);
|
|
@@ -379,6 +483,8 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
379
483
|
parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
|
|
380
484
|
parts.push(`${context.currentStepGoal}`);
|
|
381
485
|
parts.push(``);
|
|
486
|
+
parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
|
|
487
|
+
parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
|
|
382
488
|
parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
|
|
383
489
|
if (context.completedSteps.length > 0) {
|
|
384
490
|
parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
|
|
@@ -386,14 +492,42 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
386
492
|
if (context.remainingSteps.length > 0) {
|
|
387
493
|
parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
|
|
388
494
|
}
|
|
495
|
+
// SoM screenshot (if available)
|
|
496
|
+
if (context.somScreenshot) {
|
|
497
|
+
parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
|
|
498
|
+
parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
|
|
499
|
+
parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
|
|
500
|
+
parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
|
|
501
|
+
parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
|
|
502
|
+
parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
|
|
503
|
+
parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
|
|
504
|
+
parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
|
|
505
|
+
parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
|
|
506
|
+
parts.push(``);
|
|
507
|
+
// SoM element map for disambiguation
|
|
508
|
+
if (context.somElementMap) {
|
|
509
|
+
parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
|
|
510
|
+
parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
|
|
511
|
+
parts.push(context.somElementMap);
|
|
512
|
+
parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
|
|
513
|
+
parts.push(``);
|
|
514
|
+
}
|
|
515
|
+
}
|
|
389
516
|
// Current page state (most variable content - at the end)
|
|
390
517
|
parts.push(`\nCURRENT PAGE:`);
|
|
391
518
|
parts.push(`URL: ${context.currentURL}`);
|
|
392
519
|
parts.push(`Title: ${context.currentPageInfo.title}`);
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
520
|
+
// Only include DOM details if NOT in SoM mode
|
|
521
|
+
if (!context.somScreenshot) {
|
|
522
|
+
parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
|
|
523
|
+
parts.push(context.currentPageInfo.formattedElements);
|
|
524
|
+
parts.push(`\nARIA TREE (hierarchical structure):`);
|
|
525
|
+
parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
|
|
526
|
+
}
|
|
527
|
+
else {
|
|
528
|
+
// In SoM mode, skip DOM details - agent uses visual screenshot instead
|
|
529
|
+
parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
|
|
530
|
+
}
|
|
397
531
|
if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
|
|
398
532
|
parts.push('... (truncated)');
|
|
399
533
|
}
|
|
@@ -402,11 +536,11 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
402
536
|
if (context.recentSteps.length > 0) {
|
|
403
537
|
parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
|
|
404
538
|
for (const step of context.recentSteps) {
|
|
405
|
-
const status = step.result === 'success' ? '
|
|
539
|
+
const status = step.result === 'success' ? '[OK]' : '[FAIL]';
|
|
406
540
|
parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
|
|
407
541
|
parts.push(` Code: ${step.code}`);
|
|
408
542
|
if (step.result === 'failure' && step.error) {
|
|
409
|
-
parts.push(`
|
|
543
|
+
parts.push(` ERROR: ${step.error}`);
|
|
410
544
|
parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
|
|
411
545
|
}
|
|
412
546
|
else {
|
|
@@ -419,7 +553,7 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
419
553
|
if (recentFailures.length >= 2) {
|
|
420
554
|
const sameSelector = recentFailures.slice(-2).every((s, i, arr) => i === 0 || s.code === arr[i - 1].code);
|
|
421
555
|
if (sameSelector) {
|
|
422
|
-
parts.push(
|
|
556
|
+
parts.push(`[WARNING] WARNING: You've tried the same selector multiple times and it failed!`);
|
|
423
557
|
parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
|
|
424
558
|
parts.push(` YOU MUST try a completely different selector this time!\n`);
|
|
425
559
|
}
|
|
@@ -439,15 +573,163 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
|
|
|
439
573
|
parts.push(JSON.stringify(context.extractedData, null, 2));
|
|
440
574
|
parts.push('');
|
|
441
575
|
}
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
576
|
+
return parts.join('\n');
|
|
577
|
+
}
|
|
578
|
+
/**
|
|
579
|
+
* Build exploratory system prompt for autonomous exploration
|
|
580
|
+
*/
|
|
581
|
+
static buildExploratorySystemPrompt(toolDescriptions) {
|
|
582
|
+
return `You are an autonomous exploration agent that discovers and tests web application features.
|
|
583
|
+
|
|
584
|
+
${toolDescriptions}
|
|
585
|
+
|
|
586
|
+
YOUR RESPONSE FORMAT - Output JSON matching this interface:
|
|
587
|
+
|
|
588
|
+
interface AgentDecisionLLMResponse {
|
|
589
|
+
status: string; // "continue" | "complete" | "stuck"
|
|
590
|
+
reasoning: string; // What you're exploring and why
|
|
591
|
+
|
|
592
|
+
// COMMANDS: Array of plain Playwright command strings
|
|
593
|
+
commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
|
|
594
|
+
commandReasoning?: string;
|
|
595
|
+
toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
|
|
596
|
+
name: string;
|
|
597
|
+
params: Record<string, any>;
|
|
598
|
+
}>;
|
|
599
|
+
toolReasoning?: string;
|
|
600
|
+
needsToolResults?: boolean;
|
|
601
|
+
noteToFutureSelf?: string;
|
|
602
|
+
coordinateAction?: { ... };
|
|
603
|
+
experiences?: string[]; // Use for BOTH app patterns AND exploration progress
|
|
604
|
+
blockerDetected?: { ... };
|
|
605
|
+
debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
|
|
606
|
+
suggestedPromptUpdates?: string;
|
|
607
|
+
reasoning?: string;
|
|
608
|
+
};
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
EXPLORATION MODE GUIDELINES:
|
|
612
|
+
|
|
613
|
+
1. **JOURNEY-FOCUSED EXPLORATION**: Follow the exploration prompt as your goal for THIS journey
|
|
614
|
+
- Example prompt: "Explore Dashboard and test all widgets"
|
|
615
|
+
- You should systematically test dashboard widgets, not wander off to other sections
|
|
616
|
+
- Stay focused on the given journey goal
|
|
617
|
+
|
|
618
|
+
2. **ICON BUTTONS**: Match step goal to icon semantics (plus=add, gear=settings, hamburger=menu). Check element map for confirmation. Don't randomly try - reason about fit.
|
|
619
|
+
|
|
620
|
+
3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).
|
|
621
|
+
|
|
622
|
+
4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.
|
|
623
|
+
|
|
624
|
+
5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.
|
|
625
|
+
|
|
626
|
+
7. **LIMITATIONS**: Cannot complete: sign-up, forgot password, OTP, CAPTCHA, email verification (no inbox/SMS access).
|
|
627
|
+
If encountered: CAPTCHA → stuck, sign-up/OTP → skip and explore other areas.
|
|
628
|
+
|
|
629
|
+
8. **AUTH**: If credentials provided, login FIRST using exact testDataPrompt values. Don't explore public pages or click sign-up.
|
|
630
|
+
|
|
631
|
+
9. **BLOCKERS**: Clear cookie modals, tour popups autonomously with blockerDetected.clearingCommands. CAPTCHA → stuck.
|
|
632
|
+
|
|
633
|
+
10. **STATUS**: complete=goal achieved or budget low, continue=need more, stuck=cannot proceed. Complete when journey goal met, don't wait for maxSteps.
|
|
634
|
+
|
|
635
|
+
11. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.
|
|
636
|
+
|
|
637
|
+
CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
|
|
638
|
+
YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Build exploratory user prompt with context
|
|
642
|
+
*/
|
|
643
|
+
static buildExploratoryUserPrompt(context, explorationPrompt, testDataPrompt, stepNumber, maxSteps) {
|
|
644
|
+
const parts = [];
|
|
645
|
+
// Add SoM format reminder if screenshot is present
|
|
646
|
+
if (context.somScreenshot) {
|
|
647
|
+
parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
|
|
648
|
+
parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
|
|
649
|
+
parts.push(`Format: { "elementRef": "1", "action": "click" }`);
|
|
650
|
+
parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
|
|
651
|
+
}
|
|
652
|
+
parts.push('=== JOURNEY EXPLORATION CONTEXT ===\n');
|
|
653
|
+
parts.push(`GOAL: ${explorationPrompt}`);
|
|
654
|
+
parts.push(` (Focus on THIS specific goal - don't wander to unrelated areas)\n`);
|
|
655
|
+
if (testDataPrompt) {
|
|
656
|
+
parts.push(`TEST DATA/CREDENTIALS: ${testDataPrompt}`);
|
|
657
|
+
parts.push(` [WARNING] IMPORTANT: If credentials are provided above (email/username and password), you MUST:`);
|
|
658
|
+
parts.push(` - Use them to LOGIN and explore authenticated features`);
|
|
659
|
+
parts.push(` - Fill login forms with the exact credentials provided`);
|
|
660
|
+
parts.push(` - Don't waste time on public/unauthenticated pages when you can login`);
|
|
661
|
+
parts.push(` - Prioritize exploring the authenticated app experience\n`);
|
|
662
|
+
}
|
|
663
|
+
if (stepNumber && maxSteps) {
|
|
664
|
+
parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
|
|
665
|
+
}
|
|
666
|
+
// Show discovered and tracked data from extractedData
|
|
667
|
+
if (context.extractedData && Object.keys(context.extractedData).length > 0) {
|
|
668
|
+
parts.push(`\nDISCOVERED DATA (this journey):`);
|
|
669
|
+
for (const [key, value] of Object.entries(context.extractedData)) {
|
|
670
|
+
parts.push(` ${key}: ${value}`);
|
|
448
671
|
}
|
|
449
|
-
parts.push('');
|
|
450
672
|
}
|
|
673
|
+
// SoM screenshot (if available)
|
|
674
|
+
if (context.somScreenshot) {
|
|
675
|
+
parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
|
|
676
|
+
parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
|
|
677
|
+
parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
|
|
678
|
+
parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
|
|
679
|
+
parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
|
|
680
|
+
parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
|
|
681
|
+
parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
|
|
682
|
+
parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
|
|
683
|
+
parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
|
|
684
|
+
parts.push(``);
|
|
685
|
+
// SoM element map for disambiguation
|
|
686
|
+
if (context.somElementMap) {
|
|
687
|
+
parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
|
|
688
|
+
parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
|
|
689
|
+
parts.push(context.somElementMap);
|
|
690
|
+
parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
|
|
691
|
+
parts.push(``);
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
parts.push(`\nCURRENT PAGE:`);
|
|
695
|
+
parts.push(`URL: ${context.currentURL}`);
|
|
696
|
+
parts.push(`Title: ${context.currentPageInfo.title}`);
|
|
697
|
+
// Only include DOM details if NOT in SoM mode
|
|
698
|
+
if (!context.somScreenshot) {
|
|
699
|
+
parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
|
|
700
|
+
parts.push(context.currentPageInfo.formattedElements);
|
|
701
|
+
parts.push(`\nARIA TREE (hierarchical structure):`);
|
|
702
|
+
parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
|
|
703
|
+
}
|
|
704
|
+
else {
|
|
705
|
+
// In SoM mode, skip DOM details - agent uses visual screenshot
|
|
706
|
+
parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
|
|
707
|
+
}
|
|
708
|
+
if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
|
|
709
|
+
parts.push('... (truncated)');
|
|
710
|
+
}
|
|
711
|
+
// Recent actions
|
|
712
|
+
if (context.recentSteps.length > 0) {
|
|
713
|
+
parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
|
|
714
|
+
for (const step of context.recentSteps) {
|
|
715
|
+
const status = step.result === 'success' ? '[OK]' : '[FAIL]';
|
|
716
|
+
parts.push(` ${status} ${step.action}`);
|
|
717
|
+
parts.push(` ${step.observation}`);
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
// Learnings and exploration progress
|
|
721
|
+
if (context.experiences && context.experiences.length > 0) {
|
|
722
|
+
parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
|
|
723
|
+
for (const exp of context.experiences) {
|
|
724
|
+
parts.push(` • ${exp}`);
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
// Note from previous iteration
|
|
728
|
+
if (context.noteFromPreviousIteration) {
|
|
729
|
+
parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
|
|
730
|
+
parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
|
|
731
|
+
}
|
|
732
|
+
parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);
|
|
451
733
|
return parts.join('\n');
|
|
452
734
|
}
|
|
453
735
|
}
|