testchimp-runner-core 0.0.35 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  2. package/dist/orchestrator/orchestrator-agent.js +7 -4
  3. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  4. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
  5. package/dist/orchestrator/orchestrator-prompts.js +73 -15
  6. package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
  7. package/dist/orchestrator/page-som-handler.d.ts +1 -2
  8. package/dist/orchestrator/page-som-handler.d.ts.map +1 -1
  9. package/dist/orchestrator/page-som-handler.js +51 -25
  10. package/dist/orchestrator/page-som-handler.js.map +1 -1
  11. package/package.json +6 -1
  12. package/plandocs/BEFORE_AFTER_VERIFICATION.md +0 -148
  13. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +0 -144
  14. package/plandocs/CREDIT_CALLBACK_ARCHITECTURE.md +0 -253
  15. package/plandocs/HUMAN_LIKE_IMPROVEMENTS.md +0 -642
  16. package/plandocs/IMPLEMENTATION_STATUS.md +0 -108
  17. package/plandocs/INTEGRATION_COMPLETE.md +0 -322
  18. package/plandocs/MULTI_AGENT_ARCHITECTURE_REVIEW.md +0 -844
  19. package/plandocs/ORCHESTRATOR_MVP_SUMMARY.md +0 -539
  20. package/plandocs/PHASE1_ABSTRACTION_COMPLETE.md +0 -241
  21. package/plandocs/PHASE1_FINAL_STATUS.md +0 -210
  22. package/plandocs/PHASE_1_COMPLETE.md +0 -165
  23. package/plandocs/PHASE_1_SUMMARY.md +0 -184
  24. package/plandocs/PLANNING_SESSION_SUMMARY.md +0 -372
  25. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +0 -120
  26. package/plandocs/PROMPT_SANITY_CHECK.md +0 -120
  27. package/plandocs/SCRIPT_CLEANUP_FEATURE.md +0 -201
  28. package/plandocs/SCRIPT_GENERATION_ARCHITECTURE.md +0 -364
  29. package/plandocs/SELECTOR_IMPROVEMENTS.md +0 -139
  30. package/plandocs/SESSION_SUMMARY_v0.0.33.md +0 -151
  31. package/plandocs/TROUBLESHOOTING_SESSION.md +0 -72
  32. package/plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md +0 -336
  33. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +0 -396
  34. package/plandocs/WHATS_NEW_v0.0.33.md +0 -183
  35. package/plandocs/exploratory-mode-support-v2.plan.md +0 -953
  36. package/plandocs/exploratory-mode-support.plan.md +0 -928
  37. package/plandocs/journey-id-tracking-addendum.md +0 -227
  38. package/releasenotes/RELEASE_0.0.26.md +0 -165
  39. package/releasenotes/RELEASE_0.0.27.md +0 -236
  40. package/releasenotes/RELEASE_0.0.28.md +0 -286
  41. package/src/auth-config.ts +0 -84
  42. package/src/credit-usage-service.ts +0 -188
  43. package/src/env-loader.ts +0 -103
  44. package/src/execution-service.ts +0 -996
  45. package/src/file-handler.ts +0 -104
  46. package/src/index.ts +0 -432
  47. package/src/llm-facade.ts +0 -821
  48. package/src/llm-provider.ts +0 -53
  49. package/src/model-constants.ts +0 -35
  50. package/src/orchestrator/decision-parser.ts +0 -139
  51. package/src/orchestrator/index.ts +0 -58
  52. package/src/orchestrator/orchestrator-agent.ts +0 -1282
  53. package/src/orchestrator/orchestrator-prompts.ts +0 -786
  54. package/src/orchestrator/page-som-handler.ts +0 -1565
  55. package/src/orchestrator/som-types.ts +0 -188
  56. package/src/orchestrator/tool-registry.ts +0 -184
  57. package/src/orchestrator/tools/check-page-ready.ts +0 -75
  58. package/src/orchestrator/tools/extract-data.ts +0 -92
  59. package/src/orchestrator/tools/index.ts +0 -15
  60. package/src/orchestrator/tools/inspect-page.ts +0 -42
  61. package/src/orchestrator/tools/recall-history.ts +0 -72
  62. package/src/orchestrator/tools/refresh-som-markers.ts +0 -69
  63. package/src/orchestrator/tools/take-screenshot.ts +0 -128
  64. package/src/orchestrator/tools/verify-action-result.ts +0 -159
  65. package/src/orchestrator/tools/view-previous-screenshot.ts +0 -103
  66. package/src/orchestrator/types.ts +0 -291
  67. package/src/playwright-mcp-service.ts +0 -224
  68. package/src/progress-reporter.ts +0 -144
  69. package/src/prompts.ts +0 -842
  70. package/src/providers/backend-proxy-llm-provider.ts +0 -91
  71. package/src/providers/local-llm-provider.ts +0 -38
  72. package/src/scenario-service.ts +0 -252
  73. package/src/scenario-worker-class.ts +0 -1110
  74. package/src/script-utils.ts +0 -203
  75. package/src/types.ts +0 -239
  76. package/src/utils/browser-utils.ts +0 -348
  77. package/src/utils/coordinate-converter.ts +0 -162
  78. package/src/utils/page-info-retry.ts +0 -65
  79. package/src/utils/page-info-utils.ts +0 -285
  80. package/testchimp-runner-core-0.0.35.tgz +0 -0
  81. package/tsconfig.json +0 -19
@@ -1,786 +0,0 @@
1
- /**
2
- * Orchestrator Agent Prompts
3
- * Extracted from orchestrator-agent.ts for better maintainability
4
- */
5
-
6
- import { AgentContext } from './types';
7
-
8
- export class OrchestratorPrompts {
9
-
10
- /**
11
- * Build main system prompt for selector-based mode
- *
- * Returns a single template-literal prompt. The only interpolated value is
- * `toolDescriptions`, which is inserted verbatim after the "Difficult: ..." hint
- * and before the "CRITICAL: STEP COMPLETION DECISION" section.
- *
- * @param toolDescriptions - Text describing the available tools; embedded as-is.
- * @param enableCoordinateMode - Defaults to false. Not referenced anywhere in this method's body.
- * @returns The assembled system prompt string.
- *
- * NOTE(review): the prompt shows two different shapes for "commands" - typed objects
- * ({ "type": "ref" | "playwright", ... }) under OUTPUT FORMAT, and plain Playwright
- * strings under COMMANDS FORMAT. Confirm which shape the consumer of the response expects.
12
- */
13
- static buildSystemPrompt(toolDescriptions: string, enableCoordinateMode: boolean = false): string {
14
- return `You are an intelligent test automation agent that executes web scenarios using Playwright.
15
-
16
- DISCRETE EXPERIENCE LOOP:
17
- You operate in iterations: receive state → decide → sleep → wake with new state.
18
-
19
- Key implications:
20
- - System waits for page stability after each batch
21
- - Effects may be transient (alerts) or persistent (error labels) - suggest checking persistent indicators
22
- - Batch safe commands (fill all fields together), separate DOM-changing ones (click then fill)
23
- - Note to future self: strategy, what to verify, backup plans if action fails
24
-
25
- EXECUTION PRIORITIES:
26
- 1. SoM-marked elements (reliable selectors)
27
- 2. Keyboard nav from marked elements (Tab, Enter)
28
- 3. Coords for unmarked elements (valid fallback)
29
-
30
- Difficult: Shadow DOM, iframes, canvas UIs, file uploads - try keyboard or coords.
31
-
32
- ${toolDescriptions}
33
-
34
- CRITICAL: STEP COMPLETION DECISION
35
- Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
36
-
37
- **When to mark COMPLETE:**
38
- - Step: "Navigate to URL" → Mark complete after navigate command succeeds (don't login yet!)
39
- - Step: "Fill login form" → Mark complete after filling fields (don't click submit yet!)
40
- - Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
41
-
42
- **DO NOT:**
43
- - Continue with future steps while still on current step
44
- - Assume the step wants you to do more than stated
45
- - Wait for side effects (navigation, etc.) before marking complete
46
-
47
- **The goal text is LITERAL** - do exactly what it says, then mark complete.
48
-
49
- OUTPUT FORMAT (JSON):
50
-
51
- {
52
- "status": "continue" | "complete" | "stuck" | "infeasible",
53
- "reasoning": "your thinking",
54
-
55
- "commands": [ // Mix ref and playwright commands as needed
56
- { "type": "playwright", "code": "await page.goto('https://example.com')" },
57
- { "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
58
- { "type": "ref", "ref": "e31", "operation": "click" },
59
- { "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
60
- ],
61
-
62
- "toolCalls": [{ "name": "tool_name", "params": {} }],
63
- "blockerDetected": { "description": "...", "clearingCommands": ["..."] },
64
- "experiences": ["app pattern"],
65
- "noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
66
- "debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
67
- "suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
68
- "reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
69
- }
70
- }
71
-
72
- NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
73
- Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
74
- Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
75
-
76
- META-LEARNING (debugInfo): Could this prompt have been better. Suggest fixes.
77
-
78
- STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
79
- RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
80
-
81
- COMMANDS FORMAT:
82
-
83
- Array of plain Playwright command strings:
84
- {
85
- "commands": [
86
- "await page.fill('input[name=\"email\"]', 'user@test.com')",
87
- "await page.fill('input[name=\"password\"]', 'secret123')",
88
- "await page.click('button[type=\"submit\"]')"
89
- ]
90
- }
91
-
92
- SELECTOR STRATEGIES (use in order of preference):
93
- 1. getByRole: page.getByRole('button', {name: 'Login'})
94
- 2. getByLabel: page.getByLabel('Email address')
95
- 3. getByPlaceholder: page.getByPlaceholder('Enter email')
96
- 4. getByText: page.getByText('Sign in')
97
- 5. CSS: page.locator('input[name="email"]')
98
- 6. Test IDs: page.getByTestId('login-button')
99
-
100
- Example login commands:
101
- {
102
- "commands": [
103
- "await page.getByLabel('Email').fill('user@test.com')",
104
- "await page.getByLabel('Password').fill('secret123')",
105
- "await page.getByRole('button', {name: 'Submit'}).click()"
106
- ]
107
- }
108
-
109
- INTERACTIVE ELEMENTS section shows available selectors for each element.`;
110
- }
111
-
112
- /**
113
- * Build SoM (Set-of-Marks) system prompt for visual element identification
- *
- * @param restrictCoordinates - Defaults to false. When true, a "COORDINATE COMMANDS
- *   RESTRICTED" section is interpolated directly after the prompt's opening sentence;
- *   when false, an empty string is interpolated there instead.
- * @returns The SoM system prompt string, including the TypeScript interface listing,
- *   the example responses, and the final output-format instruction.
- *
- * NOTE(review): the prompt line "See available verifications in comment above" comes
- * before the "Available verifications" list, which appears further down in the prompt.
114
- */
115
- static buildSomSystemPrompt(restrictCoordinates: boolean = false): string {
116
- const coordinateRestriction = restrictCoordinates ? `
117
-
118
- CRITICAL: COORDINATE COMMANDS RESTRICTED
119
- You are in SCRIPT GENERATION mode. Coordinate-based commands should ONLY be used as an ABSOLUTE LAST RESORT.
120
-
121
- Strong preference order:
122
- 1. Use SoM-marked elements with actions (fill, click, press Enter)
123
- 2. Use keyboard navigation from SoM-marked elements (Tab, Arrow keys, Enter to submit)
124
- 3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
125
-
126
- If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
127
-
128
- return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
129
-
130
- DISCRETE EXPERIENCE LOOP:
131
- You operate in iterations: receive state → decide → sleep → wake with new state.
132
- System waits for page stability after each batch - you ALWAYS receive fully loaded pages (never loading screens).
133
- Batch safe commands, suggest persistent indicators, tell future self what to verify.
134
-
135
- IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
136
-
137
- SCREENSHOT SCOPE:
138
- - Shows VIEWPORT ONLY (what's currently visible, not full page)
139
- - Elements outside the viewport are NOT shown (you must scroll to reveal them)
140
- - If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
141
-
142
- VISUAL MARKER SYSTEM:
143
- - Each interactive element has a colored bounding box with a unique color
144
- - The element ID (1, 2, 3, etc.) is displayed in a label at the TOP-RIGHT corner, OUTSIDE the box
145
- - The label is typically positioned OUTSIDE and ABOVE the bounding box (not attached)
146
- - The label color matches the bounding box color for easy correlation
147
- - TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color
148
-
149
- REFERENCE ELEMENTS BY ID:
150
- - To interact with an element, reference its ID in your commands
151
-
152
- ICON BUTTON IDENTIFICATION:
153
- When step involves icon buttons (no visible text), use COMMON ICON SEMANTICS + element map:
154
- - Match step goal to icon meaning: "Add Campaign" → plus icon, "Settings" → gear, "Menu" → hamburger, "Delete" → trash
155
- - Check element map for aria-label confirmation: [5] might show (aria: "add-campaign")
156
- - Common icons: hamburger=menu, gear=settings, plus=add, trash=delete, arrow=back/nav, check=confirm, X=close, magnifier=search, dots=more
157
- - DON'T randomly try icon buttons - reason about which icon fits the step goal
158
-
159
- Example: Step "Add new campaign" → Look for plus icon in toolbar → Check map shows aria "add" → Use that ID.
160
-
161
- CRITICAL: ONLY INTERACT WITH VISIBLE ELEMENTS - use your EYES, not assumptions!
162
- FORBIDDEN: Guessing locations, assuming "typical" positions, clicking without seeing element.
163
- REQUIRED: Only interact with elements you SEE in screenshot. If not visible, scroll or use take_screenshot(isFullPage=true).
164
- If action fails, try alternative elements - don't repeat same ID blindly.
165
-
166
- TYPESCRIPT INTERFACES (your response MUST conform to these):
167
-
168
- \`\`\`typescript
169
- interface Coordinate {
170
- x: number; // Percentage of viewport width (0-100, use 3 decimals: 15.625)
171
- y: number; // Percentage of viewport height (0-100, use 3 decimals: 82.375)
172
- }
173
-
174
- interface SomCommand {
175
- action: InteractionAction; // REQUIRED: Action to perform (distinguishes from SomVerification)
176
- elementRef?: string; // Element ID from screenshot (e.g., "1", "2", "42")
177
- coord?: Coordinate; // Direct percentage-based coords (use when SoM marker missing)
178
- value?: string; // For fill/select/press actions
179
- fromCoord?: Coordinate; // For drag actions (start point)
180
- toCoord?: Coordinate; // For drag actions (end point)
181
- // ... other optional parameters
182
- }
183
-
184
- interface SomVerification {
185
- verificationType: VerificationType; // REQUIRED: Type of verification (distinguishes from SomCommand)
186
- elementRef?: string; // SoM ID (e.g., "3") - optional for count verifications
187
- expected?: string | number; // Expected value/text/count
188
- description?: string; // Human-readable description
189
- selector?: string; // CSS selector for count verifications (e.g., 'ul.items > li')
190
- }
191
-
192
- // See available verifications in comment above
193
-
194
- COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationType').
195
- Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
196
- CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
197
-
198
- COORDINATES (when SoM marker missing):
199
- Use percentage-based coords for unmarked elements:
200
- { "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
201
-
202
- Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
203
- After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
204
-
205
- NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
206
- Example: { "action": "navigate", "value": "https://..." }
207
- DON'T click address bar - use navigate action. System waits for page load after navigation.
208
-
209
- // Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
210
- // Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
211
-
212
- interface AgentDecisionLLMResponse {
213
- status: "continue" | "complete" | "stuck" | "infeasible";
214
- reasoning: string;
215
- commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
216
- commandReasoning?: string;
217
- toolCalls?: Array<{ name: string; params: Record<string, any> }>;
218
- noteToFutureSelf?: string;
219
- experiences?: string[];
220
- blockerDetected?: { description: string; clearingCommands: SomCommand[] };
221
- debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
222
- }
223
- \`\`\`
224
-
225
- NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
226
- Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
227
-
228
- EXAMPLE RESPONSES:
229
-
230
- Action step:
231
- \`\`\`json
232
- {
233
- "status": "continue",
234
- "reasoning": "Need to fill login form with credentials",
235
- "commands": [
236
- { "elementRef": "5", "action": "fill", "value": "user@example.com" },
237
- { "elementRef": "7", "action": "fill", "value": "password123" },
238
- { "elementRef": "12", "action": "click" }
239
- ],
240
- "commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
241
- }
242
- \`\`\`
243
-
244
- Verification step:
245
- \`\`\`json
246
- {
247
- "status": "complete",
248
- "reasoning": "Message sent and verified in conversation",
249
- "commands": [
250
- { "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
251
- { "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
252
- ],
253
- "commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
254
- }
255
- \`\`\`
256
-
257
- REPAIR MODE - Step already completed (DELETE case):
258
- \`\`\`json
259
- {
260
- "status": "complete",
261
- "reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
262
- "commands": [],
263
- "commandReasoning": "No commands needed - step goal already achieved/obsolete"
264
- }
265
- \`\`\`
266
-
267
- OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
268
- }
269
-
270
- /**
271
- * Build coordinate-specific system prompt (used when selectors repeatedly fail)
- *
- * Takes no arguments and interpolates nothing: the returned prompt is a fixed string.
- * The prompt tells the model to respond with a `coordinateAction` whose position is
- * given as xPercent/yPercent (0-100) measured from the viewport's top-left corner.
- *
- * @returns The coordinate-mode system prompt string.
272
- */
273
- static buildCoordinateSystemPrompt(): string {
274
- return `You are a visual web automation expert. Selector generation has FAILED multiple times.
275
-
276
- YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
277
-
278
- SCREENSHOT PROVIDED:
279
- You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
280
-
281
- CRITICAL - IDENTIFY THE CORRECT ELEMENT:
282
- 1. READ the step goal carefully - what specific element are you looking for?
283
- 2. Look for the colored bounding box that matches the element description
284
- 3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
285
- 4. Match the label color to the bounding box color
286
- 5. LOCATE that element in the screenshot (NOT a similar-looking element!)
287
- 6. VERIFY position using screen regions:
288
- - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
289
- - Center content: xPercent ~30-70%
290
- - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
291
- 7. CALCULATE percentages from element's CENTER position
292
- 8. SANITY CHECK your percentages:
293
- - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
294
- - Button in top-left at 90%? WRONG - that's top-right!
295
- - Element description says "left" but x > 50%? WRONG - recheck!
296
-
297
- Example thought process:
298
- Goal: "Click Settings link in left navigation"
299
- → I see "Settings" text in LEFT navigation panel in the screenshot
300
- → Visual estimate: The link appears in the far left sidebar
301
- → Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
302
- → Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
303
- → xPercent: 12.500, yPercent: 32.000
304
- → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
305
- → Description: "Clicking center of Settings link in left sidebar"
306
-
307
- CRITICAL VISUAL ESTIMATION TIPS:
308
- - Divide screenshot mentally into quadrants/regions
309
- - Left sidebar usually ~5-20% from left, center content ~30-70%, right sidebar ~75-95%
310
- - Aim for CENTER of element, not edges
311
- - Top bar usually 0-10% from top, footer usually 90-100%
312
- - Be conservative: slightly off-center is better than way off
313
-
314
- YOUR RESPONSE FORMAT - Output JSON matching this interface:
315
-
316
- interface AgentDecisionLLMResponse {
317
- status: string; // REQUIRED: "continue" (usually for coordinate mode)
318
- reasoning: string; // REQUIRED: "I see [element] at (X%, Y%) - using coordinates"
319
- coordinateAction: { // REQUIRED in coordinate mode
320
- type: "coordinate";
321
- action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
322
- xPercent: number; // 0-100, 3 decimals
323
- yPercent: number; // 0-100, 3 decimals
324
- toXPercent?: number; // For drag
325
- toYPercent?: number; // For drag
326
- value?: string; // For fill
327
- scrollAmount?: number; // For scroll
328
- };
329
- noteToFutureSelf?: string; // Optional: What to try if this fails
330
- }
331
-
332
- COORDINATE REFERENCE:
333
- - Top-left corner: xPercent=0, yPercent=0
334
- - Top-right corner: xPercent=100, yPercent=0
335
- - Bottom-left corner: xPercent=0, yPercent=100
336
- - Bottom-right corner: xPercent=100, yPercent=100
337
- - Center of screen: xPercent=50, yPercent=50
338
-
339
- Use 3 decimal places for precision (e.g., 15.755, not 16).
340
-
341
- ACTIONS:
342
-
343
- **Physical clicks:**
344
- - click: { action: "click", xPercent: 15.755, yPercent: 8.500 }
345
- - doubleClick: { action: "doubleClick", xPercent: 15.755, yPercent: 8.500 }
346
- - rightClick: { action: "rightClick", xPercent: 15.755, yPercent: 8.500 }
347
- - hover: { action: "hover", xPercent: 15.755, yPercent: 8.500 }
348
-
349
- **Input actions:**
350
- - fill: Click then type
351
- { action: "fill", xPercent: 30.000, yPercent: 25.000, value: "alice@example.com" }
352
-
353
- **Movement actions:**
354
- - drag: From one position to another
355
- { action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000 }
356
- - scroll: At position, scroll by amount
357
- { action: "scroll", xPercent: 50.000, yPercent: 50.000, scrollAmount: 500 }
358
-
359
- CRITICAL RULES:
360
- - Percentages are from viewport TOP-LEFT (not full page)
361
- - Use element CENTER for coordinates, not edges
362
- - Be precise with decimals - wrong coords click wrong element
363
- - For fill: system will click at (x%,y%) then type value automatically
364
- - For drag: toXPercent/toYPercent are REQUIRED
365
-
366
- DO NOT try to generate selectors - that approach already failed. Use coordinates only.
367
- This is a last-resort mechanism, but it WILL work if you provide accurate percentages.`;
368
- }
369
-
370
- /**
371
- * Build user prompt with context
372
- */
373
- static buildUserPrompt(context: AgentContext, consecutiveFailures?: number, enableCoordinateMode: boolean = false): string {
374
- const parts: string[] = [];
375
-
376
- // Add SoM format reminder if screenshot is present
377
- if (context.somScreenshot) {
378
- parts.push(`[WARNING] SET-OF-MARKS MODE ACTIVE`);
379
- parts.push(`Your commands MUST be SomCommand objects (NOT Playwright strings).`);
380
- parts.push(`Format: { "elementRef": "5", "action": "fill", "value": "text" }`);
381
- parts.push(`See TypeScript interfaces in system prompt for exact format.\n`);
382
- }
383
-
384
- // Add repair mode context if present
385
- if (context.priorSteps && context.priorSteps.length > 0) {
386
- parts.push(`=== REPAIR MODE ===`);
387
- parts.push(`Fixing a FAILED step in existing script. Page persisted from prior steps.\n`);
388
-
389
- parts.push(`COMPLETED STEPS (already executed):`);
390
- for (let i = 0; i < context.priorSteps.length; i++) {
391
- parts.push(` ${i + 1}. [OK] ${context.priorSteps[i]}`);
392
- }
393
- parts.push(``);
394
-
395
- parts.push(`>>> FAILED STEP ${context.priorSteps.length + 1}: ${context.currentStepGoal}`);
396
- parts.push(` This step FAILED. Your job: fix it using current UI (SoM markers).\n`);
397
-
398
- if (context.nextSteps && context.nextSteps.length > 0) {
399
- parts.push(`REMAINING STEPS (auto-executes after you fix current):`);
400
- for (let i = 0; i < context.nextSteps.length; i++) {
401
- parts.push(` ${context.priorSteps.length + 2 + i}. ${context.nextSteps[i]}`);
402
- }
403
- parts.push(``);
404
- }
405
-
406
- parts.push(`REPAIR STRATEGY:`);
407
- parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
408
- parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
409
- parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
410
- parts.push(`- Use SoM markers to identify current elements`);
411
- parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
412
- parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
413
- parts.push(` → Repair mode = single step fix, then hand back control`);
414
- parts.push(` → Don't continue to next steps - script will auto-execute them`);
415
- parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
416
- }
417
-
418
- // Put static instructions first for LLM caching efficiency
419
- parts.push('STEP EXECUTION RULES:');
420
- parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
421
- parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
422
- parts.push('- Mark "complete" ONLY if commands succeeded');
423
- parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
424
- parts.push('- Max 5 iterations per step, then forced STUCK\n');
425
-
426
- // Dynamic content follows (changes per iteration)
427
- parts.push('=== CURRENT CONTEXT ===\n');
428
-
429
- // Display note from previous iteration (high priority tactical info)
430
- if (context.noteFromPreviousIteration) {
431
- const note = context.noteFromPreviousIteration;
432
- parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
433
- parts.push(` ${note.content}`);
434
- parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
435
- parts.push(``);
436
- parts.push(` ACTION REQUIRED:`);
437
- parts.push(` 1. Did your previous action work? Check the screenshot!`);
438
- parts.push(` 2. If it WORKED: Execute next step from your plan`);
439
- parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
440
- parts.push(` 4. Write NEW noteToFutureSelf with:`);
441
- parts.push(` - What worked/didn't work (learn from attempts)`);
442
- parts.push(` - Updated strategy with new backup plan`);
443
- parts.push(` - Next alternatives to try if this fails`);
444
- parts.push(` - Build on previous note's reasoning`);
445
- parts.push(``);
446
- parts.push(` DON'T repeat failed actions - try your backup plan!`);
447
- parts.push('');
448
- }
449
-
450
- // Check for screenshot loops (analysis paralysis) - PER STEP tracking
451
- const screenshotsThisStep = context.recentSteps.filter(s =>
452
- s.stepNumber === context.stepNumber &&
453
- (s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'))
454
- );
455
- const recentScreenshots = context.recentSteps.slice(-3).filter(s =>
456
- s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')
457
- );
458
-
459
- if (screenshotsThisStep.length >= 3) {
460
- parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
461
- parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
462
- parts.push(`NO MORE SCREENSHOTS ALLOWED - YOU MUST ACT NOW!`);
463
- parts.push(`Pick ANY selector from your prior DOM snapshots and TRY IT.`);
464
- parts.push(`Even if uncertain, execute the command. Failure is better than analysis paralysis.`);
465
- parts.push(`If toolCalls contains "take_screenshot" → SYSTEM WILL REJECT IT\n`);
466
- } else if (recentScreenshots.length >= 2) {
467
- parts.push(`[WARNING] ${recentScreenshots.length} screenshots in last 3 iterations - avoid more screenshots`);
468
- parts.push(`Use selector recommendations from prior screenshots\n`);
469
- }
470
-
471
- // System warnings for accumulated failures
472
- if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
473
- parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
474
- // Only suggest screenshot if we haven't already taken multiple THIS STEP
475
- if (screenshotsThisStep.length === 0) {
476
- parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
477
- } else {
478
- parts.push(`You already have visual context. Try different selector NOW.`);
479
- }
480
- parts.push(`Question assumptions: Am I at the right step?`);
481
- parts.push(`[WARNING]\n`);
482
- } else if (consecutiveFailures && consecutiveFailures >= 4) {
483
- parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
484
- if (enableCoordinateMode) {
485
- parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
486
- } else {
487
- parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
488
- }
489
- }
490
-
491
- // Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
492
- if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
493
- parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
494
- parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
495
- parts.push(`You MUST use coordinate-based action now (percentages).`);
496
- parts.push(`Provide coordinateAction with xPercent/yPercent (0-100, 3 decimals for precision).`);
497
- parts.push(`See system prompt for coordinate action format.`);
498
- parts.push(`🎯🎯🎯\n`);
499
- }
500
-
501
- // Goals - make current step very prominent
502
- parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
503
- parts.push(`${context.currentStepGoal}`);
504
- parts.push(``);
505
- parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
506
- parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
507
- parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
508
-
509
- if (context.completedSteps.length > 0) {
510
- parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
511
- }
512
- if (context.remainingSteps.length > 0) {
513
- parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
514
- }
515
-
516
- // SoM screenshot (if available)
517
- if (context.somScreenshot) {
518
- parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
519
- parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
520
- parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
521
- parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
522
- parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
523
- parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
524
- parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
525
- parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
526
- parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
527
- parts.push(``);
528
-
529
- // SoM element map for disambiguation
530
- if (context.somElementMap) {
531
- parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
532
- parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
533
- parts.push(context.somElementMap);
534
- parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
535
- parts.push(``);
536
- }
537
- }
538
-
539
- // Current page state (most variable content - at the end)
540
- parts.push(`\nCURRENT PAGE:`);
541
- parts.push(`URL: ${context.currentURL}`);
542
- parts.push(`Title: ${context.currentPageInfo.title}`);
543
-
544
- // Only include DOM details if NOT in SoM mode
545
- if (!context.somScreenshot) {
546
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
547
- parts.push(context.currentPageInfo.formattedElements);
548
- parts.push(`\nARIA TREE (hierarchical structure):`);
549
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
550
- } else {
551
- // In SoM mode, skip DOM details - agent uses visual screenshot instead
552
- parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
553
- }
554
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
555
- parts.push('... (truncated)');
556
- }
557
- parts.push('');
558
-
559
- // Recent steps (most variable content - at the end)
560
- if (context.recentSteps.length > 0) {
561
- parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
562
- for (const step of context.recentSteps) {
563
- const status = step.result === 'success' ? '[OK]' : '[FAIL]';
564
- parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
565
- parts.push(` Code: ${step.code}`);
566
- if (step.result === 'failure' && step.error) {
567
- parts.push(` ERROR: ${step.error}`);
568
- parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
569
- } else {
570
- parts.push(` Result: ${step.observation}`);
571
- }
572
- }
573
- parts.push('');
574
-
575
- // Detect repeated failures
576
- const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
577
- if (recentFailures.length >= 2) {
578
- const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
579
- i === 0 || s.code === arr[i-1].code
580
- );
581
- if (sameSelector) {
582
- parts.push(`[WARNING] WARNING: You've tried the same selector multiple times and it failed!`);
583
- parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
584
- parts.push(` YOU MUST try a completely different selector this time!\n`);
585
- }
586
- }
587
- }
588
-
589
- // Experiences (app-specific patterns learned)
590
- if (context.experiences && context.experiences.length > 0) {
591
- parts.push(`\nEXPERIENCES (patterns you've learned about this app):`);
592
- for (const exp of context.experiences) {
593
- parts.push(` • ${exp}`);
594
- }
595
- parts.push('');
596
- }
597
-
598
- // Extracted data (from previous extract_data tool calls)
599
- if (context.extractedData && Object.keys(context.extractedData).length > 0) {
600
- parts.push(`\nEXTRACTED DATA (available for use in commands):`);
601
- parts.push(JSON.stringify(context.extractedData, null, 2));
602
- parts.push('');
603
- }
604
-
605
- return parts.join('\n');
606
- }
607
-
608
/**
 * Build the system prompt for autonomous (exploratory) mode.
 *
 * The prompt is a single static template: a role statement, the caller-supplied
 * tool descriptions, the JSON response interface the agent must emit, and the
 * numbered exploration guidelines.
 *
 * @param toolDescriptions - Pre-formatted description of the available tools;
 *   interpolated verbatim right after the role statement.
 * @returns The complete system prompt text.
 */
static buildExploratorySystemPrompt(toolDescriptions: string): string {
  // Guidelines are numbered contiguously (1-10) so the list does not read as if
  // an item were missing.
  return `You are an autonomous exploration agent that discovers and tests web application features.

${toolDescriptions}

YOUR RESPONSE FORMAT - Output JSON matching this interface:

interface AgentDecisionLLMResponse {
  status: string; // "continue" | "complete" | "stuck"
  reasoning: string; // What you're exploring and why

  // COMMANDS: Array of plain Playwright command strings
  commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
  commandReasoning?: string;
  toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
    name: string;
    params: Record<string, any>;
  }>;
  toolReasoning?: string;
  needsToolResults?: boolean;
  noteToFutureSelf?: string;
  coordinateAction?: { ... };
  experiences?: string[]; // Use for BOTH app patterns AND exploration progress
  blockerDetected?: { ... };
  debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
    suggestedPromptUpdates?: string;
    reasoning?: string;
  };
}

EXPLORATION MODE GUIDELINES:

1. **JOURNEY-FOCUSED EXPLORATION**: Follow the exploration prompt as your goal for THIS journey
   - Example prompt: "Explore Dashboard and test all widgets"
   - You should systematically test dashboard widgets, not wander off to other sections
   - Stay focused on the given journey goal

2. **ICON BUTTONS**: Match step goal to icon semantics (plus=add, gear=settings, hamburger=menu). Check element map for confirmation. Don't randomly try - reason about fit.

3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).

4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.

5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.

6. **LIMITATIONS**: Cannot complete: sign-up, forgot password, OTP, CAPTCHA, email verification (no inbox/SMS access).
    If encountered: CAPTCHA → stuck, sign-up/OTP → skip and explore other areas.

7. **AUTH**: If credentials provided, login FIRST using exact testDataPrompt values. Don't explore public pages or click sign-up.

8. **BLOCKERS**: Clear cookie modals, tour popups autonomously with blockerDetected.clearingCommands. CAPTCHA → stuck.

9. **STATUS**: complete=goal achieved or budget low, continue=need more, stuck=cannot proceed. Complete when journey goal met, don't wait for maxSteps.

10. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.

CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
}
670
-
671
/**
 * Build the per-iteration user prompt for autonomous (exploratory) mode.
 *
 * Sections are appended in a fixed order: Set-of-Marks (SoM) format reminder,
 * journey goal, test data, progress, discovered data, SoM screenshot notes,
 * current page, recent actions, experiences, the note from the previous
 * iteration, and a closing instruction.
 *
 * @param context - Agent state for this iteration. Reads `somScreenshot`,
 *   `somElementMap`, `extractedData`, `currentURL`, `currentPageInfo`,
 *   `recentSteps`, `experiences` and `noteFromPreviousIteration`.
 * @param explorationPrompt - The journey goal, printed as `GOAL:`.
 * @param testDataPrompt - Optional test data/credentials; when present, the
 *   login instructions are included.
 * @param stepNumber - Current step; the progress line is emitted only when
 *   both `stepNumber` and `maxSteps` are truthy.
 * @param maxSteps - Step budget for the journey.
 * @returns All sections joined with newline characters.
 */
static buildExploratoryUserPrompt(
  context: AgentContext,
  explorationPrompt: string,
  testDataPrompt?: string,
  stepNumber?: number,
  maxSteps?: number
): string {
  // Maximum number of characters of the pretty-printed ARIA tree to include.
  const ariaTreeCharLimit = 5000;
  const parts: string[] = [];

  // Add SoM format reminder if screenshot is present
  if (context.somScreenshot) {
    parts.push(
      `[WARNING] SET-OF-MARKS MODE ACTIVE`,
      `Your commands MUST be SomCommand objects (NOT Playwright strings).`,
      `Format: { "elementRef": "1", "action": "click" }`,
      `See TypeScript interfaces in system prompt for exact format.\n`
    );
  }

  parts.push('=== JOURNEY EXPLORATION CONTEXT ===\n');
  parts.push(`GOAL: ${explorationPrompt}`);
  parts.push(` (Focus on THIS specific goal - don't wander to unrelated areas)\n`);

  if (testDataPrompt) {
    parts.push(
      `TEST DATA/CREDENTIALS: ${testDataPrompt}`,
      ` [WARNING] IMPORTANT: If credentials are provided above (email/username and password), you MUST:`,
      ` - Use them to LOGIN and explore authenticated features`,
      ` - Fill login forms with the exact credentials provided`,
      ` - Don't waste time on public/unauthenticated pages when you can login`,
      ` - Prioritize exploring the authenticated app experience\n`
    );
  }

  if (stepNumber && maxSteps) {
    parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
  }

  // Show discovered and tracked data from extractedData
  if (context.extractedData && Object.keys(context.extractedData).length > 0) {
    parts.push(`\nDISCOVERED DATA (this journey):`);
    for (const [key, value] of Object.entries(context.extractedData)) {
      // Strings are printed as-is; anything else is serialized so that objects
      // do not degrade to "[object Object]" in the prompt.
      const rendered = typeof value === 'string' ? value : JSON.stringify(value);
      parts.push(` ${key}: ${rendered}`);
    }
  }

  // SoM screenshot (if available)
  if (context.somScreenshot) {
    parts.push(
      `\n SET-OF-MARKS SCREENSHOT (with element IDs):`,
      `Screenshot shows VIEWPORT ONLY (current visible area, not full page).`,
      `Color-coded bounding boxes mark interactive elements in the viewport.`,
      `Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`,
      `Labels are typically positioned OUTSIDE and ABOVE the bounding box.`,
      `TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`,
      `If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`,
      `Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`,
      `The screenshot is attached as an image - examine it to identify elements visually.`,
      ``
    );

    // SoM element map for disambiguation
    if (context.somElementMap) {
      parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
      parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
      parts.push(context.somElementMap);
      parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
      parts.push(``);
    }
  }

  parts.push(`\nCURRENT PAGE:`);
  parts.push(`URL: ${context.currentURL}`);
  parts.push(`Title: ${context.currentPageInfo.title}`);

  if (context.somScreenshot) {
    // In SoM mode, skip DOM details - agent uses visual screenshot
    parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
  } else {
    parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
    parts.push(context.currentPageInfo.formattedElements);
    parts.push(`\nARIA TREE (hierarchical structure):`);

    // Serialize once. JSON.stringify yields undefined for an undefined snapshot,
    // so fall back to an empty string instead of calling substring on undefined.
    const ariaJson: string = JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2) ?? '';
    parts.push(ariaJson.substring(0, ariaTreeCharLimit));

    // The marker is tied to the text that was actually cut: same serialization,
    // same limit, and only in the branch that prints the tree.
    if (ariaJson.length > ariaTreeCharLimit) {
      parts.push('... (truncated)');
    }
  }

  // Recent actions
  if (context.recentSteps.length > 0) {
    parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
    for (const step of context.recentSteps) {
      const status = step.result === 'success' ? '[OK]' : '[FAIL]';
      parts.push(` ${status} ${step.action}`);
      parts.push(` ${step.observation}`);
    }
  }

  // Learnings and exploration progress
  if (context.experiences && context.experiences.length > 0) {
    parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
    for (const exp of context.experiences) {
      parts.push(` • ${exp}`);
    }
  }

  // Note from previous iteration
  if (context.noteFromPreviousIteration) {
    parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
    parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
  }

  parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);

  return parts.join('\n');
}
785
- }
786
-