testchimp-runner-core 0.0.40 → 0.0.42

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/execution-service.d.ts.map +1 -1
  2. package/dist/execution-service.js +1 -3
  3. package/dist/execution-service.js.map +1 -1
  4. package/dist/index.d.ts +7 -6
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +5 -11
  7. package/dist/index.js.map +1 -1
  8. package/dist/orchestrator/decision-parser.d.ts.map +1 -1
  9. package/dist/orchestrator/decision-parser.js +16 -0
  10. package/dist/orchestrator/decision-parser.js.map +1 -1
  11. package/dist/orchestrator/index.d.ts +4 -2
  12. package/dist/orchestrator/index.d.ts.map +1 -1
  13. package/dist/orchestrator/index.js +10 -8
  14. package/dist/orchestrator/index.js.map +1 -1
  15. package/dist/orchestrator/orchestrator-agent.d.ts +10 -4
  16. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  17. package/dist/orchestrator/orchestrator-agent.js +376 -118
  18. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  19. package/dist/orchestrator/orchestrator-prompts.d.ts +2 -10
  20. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -1
  21. package/dist/orchestrator/orchestrator-prompts.js +343 -452
  22. package/dist/orchestrator/orchestrator-prompts.js.map +1 -1
  23. package/dist/orchestrator/page-loading-utils.d.ts +15 -0
  24. package/dist/orchestrator/page-loading-utils.d.ts.map +1 -0
  25. package/dist/orchestrator/page-loading-utils.js +115 -0
  26. package/dist/orchestrator/page-loading-utils.js.map +1 -0
  27. package/dist/orchestrator/page-som-handler.d.ts +2 -1
  28. package/dist/orchestrator/page-som-handler.d.ts.map +1 -1
  29. package/dist/orchestrator/page-som-handler.js +250 -33
  30. package/dist/orchestrator/page-som-handler.js.map +1 -1
  31. package/dist/orchestrator/site-learnings-utils.d.ts +31 -0
  32. package/dist/orchestrator/site-learnings-utils.d.ts.map +1 -0
  33. package/dist/orchestrator/site-learnings-utils.js +175 -0
  34. package/dist/orchestrator/site-learnings-utils.js.map +1 -0
  35. package/dist/orchestrator/som-types.d.ts +2 -0
  36. package/dist/orchestrator/som-types.d.ts.map +1 -1
  37. package/dist/orchestrator/som-types.js.map +1 -1
  38. package/dist/orchestrator/tools/index.d.ts +9 -8
  39. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  40. package/dist/orchestrator/tools/index.js +10 -15
  41. package/dist/orchestrator/tools/index.js.map +1 -1
  42. package/dist/orchestrator/tools/take-screenshot.d.ts.map +1 -1
  43. package/dist/orchestrator/tools/take-screenshot.js +10 -1
  44. package/dist/orchestrator/tools/take-screenshot.js.map +1 -1
  45. package/dist/orchestrator/types.d.ts +54 -9
  46. package/dist/orchestrator/types.d.ts.map +1 -1
  47. package/dist/orchestrator/types.js.map +1 -1
  48. package/dist/progress-reporter.d.ts +23 -2
  49. package/dist/progress-reporter.d.ts.map +1 -1
  50. package/dist/progress-reporter.js.map +1 -1
  51. package/dist/prompts.d.ts.map +1 -1
  52. package/dist/prompts.js +14 -3
  53. package/dist/prompts.js.map +1 -1
  54. package/dist/scenario-service.d.ts +3 -3
  55. package/dist/scenario-service.d.ts.map +1 -1
  56. package/dist/scenario-service.js +6 -5
  57. package/dist/scenario-service.js.map +1 -1
  58. package/dist/scenario-worker-class.d.ts +7 -3
  59. package/dist/scenario-worker-class.d.ts.map +1 -1
  60. package/dist/scenario-worker-class.js +94 -21
  61. package/dist/scenario-worker-class.js.map +1 -1
  62. package/dist/types.d.ts +4 -0
  63. package/dist/types.d.ts.map +1 -1
  64. package/dist/types.js.map +1 -1
  65. package/package.json +1 -1
  66. package/dist/testing/agent-tester.d.ts +0 -35
  67. package/dist/testing/agent-tester.d.ts.map +0 -1
  68. package/dist/testing/agent-tester.js +0 -84
  69. package/dist/testing/agent-tester.js.map +0 -1
  70. package/dist/testing/ref-translator-tester.d.ts +0 -44
  71. package/dist/testing/ref-translator-tester.d.ts.map +0 -1
  72. package/dist/testing/ref-translator-tester.js +0 -104
  73. package/dist/testing/ref-translator-tester.js.map +0 -1
  74. package/dist/utils/hierarchical-selector.d.ts +0 -47
  75. package/dist/utils/hierarchical-selector.d.ts.map +0 -1
  76. package/dist/utils/hierarchical-selector.js +0 -212
  77. package/dist/utils/hierarchical-selector.js.map +0 -1
  78. package/dist/utils/ref-attacher.d.ts +0 -21
  79. package/dist/utils/ref-attacher.d.ts.map +0 -1
  80. package/dist/utils/ref-attacher.js +0 -149
  81. package/dist/utils/ref-attacher.js.map +0 -1
  82. package/dist/utils/ref-translator.d.ts +0 -49
  83. package/dist/utils/ref-translator.d.ts.map +0 -1
  84. package/dist/utils/ref-translator.js +0 -276
  85. package/dist/utils/ref-translator.js.map +0 -1
@@ -5,120 +5,115 @@
5
5
  */
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
7
  exports.OrchestratorPrompts = void 0;
8
- class OrchestratorPrompts {
9
- /**
10
- * Build main system prompt for selector-based mode
11
- */
12
- static buildSystemPrompt(toolDescriptions, enableCoordinateMode = false) {
13
- return `You are an intelligent test automation agent that executes web scenarios using Playwright.
14
-
15
- DISCRETE EXPERIENCE LOOP:
16
- You operate in iterations: receive state → decide → sleep → wake with new state.
17
- System waits for page stability after each batch. Note to future self: strategy, what to verify, backup plans.
18
-
19
- COMMON UX PATTERNS (critical for navigation):
20
- Disabled buttons Fill required fields first to enable them
21
- Missing SoM ID Element likely disabled (fill prerequisites first)
22
- Modals/overlays Dismiss or interact before underlying content
23
- Hover effects Reveal tooltips/menus before clicking
24
- Dropdowns/autocomplete Type then select from revealed options
25
- • Toasts/alerts → Read for success/error feedback (may be transient)
26
- Tabs/steppers Reveal new content in same page (not navigation)
27
- Form validation Red highlights/borders = invalid, fix before submit
28
- Confirmation dialogs Accept/dismiss before proceeding
29
- Lazy loading Scroll down to load more content
30
- Accordions/expandable → Click header to toggle visibility
31
-
32
- INTERACTION STRATEGY:
33
- 1. ALWAYS prefer SoM-marked elements (they have reliable selectors)
34
- 2. If element not marked: try refresh_som_markers tool (may have just enabled)
35
- 3. Last resort: coordinate-based interaction (when element truly unmarked)
36
-
37
- Difficult cases: Shadow DOM, iframes, canvas - try keyboard or coordinates as fallback.
38
-
39
- ${toolDescriptions}
40
-
41
- CRITICAL: STEP COMPLETION DECISION
42
- Each step has ONE specific goal. Once that goal is achieved, mark status="complete" IMMEDIATELY.
43
-
44
- **When to mark COMPLETE:**
45
- - Step: "Navigate to URL" Mark complete after navigate command succeeds (don't login yet!)
46
- - Step: "Fill login form" Mark complete after filling fields (don't click submit yet!)
47
- - Step: "Click Submit" → Mark complete after clicking (don't wait for next page!)
48
-
49
- **DO NOT:**
50
- - Continue with future steps while still on current step
51
- - Assume the step wants you to do more than stated
52
- - Wait for side effects (navigation, etc.) before marking complete
53
-
54
- **The goal text is LITERAL** - do exactly what it says, then mark complete.
55
-
56
- OUTPUT FORMAT (JSON):
57
-
58
- {
59
- "status": "continue" | "complete" | "stuck" | "infeasible",
60
- "reasoning": "your thinking",
8
+ // ========== UTILITY FUNCTIONS ==========
9
+ /**
10
+ * Truncate URL to avoid bloating prompts
11
+ */
12
+ function truncateUrl(url, maxLength = 300) {
13
+ if (!url || url.length <= maxLength)
14
+ return url;
15
+ return url.substring(0, maxLength) + '...';
16
+ }
17
+ // ========== SHARED PROMPT SECTIONS (to avoid duplication) ==========
18
+ const DISCRETE_EXPERIENCE_LOOP = `DISCRETE EXPERIENCE LOOP (YOU ARE STATELESS - NO SCREENSHOT MEMORY):
19
+ You operate in iterations: receive state → decide → sleep → wake with NEW state.
20
+ Each iteration you receive: current screenshot, past actions, collected memories, and noteToFutureSelf you wrote in last iteration.
21
+ CRITICAL: You do NOT see previous screenshots (unless you specifically request) - only text descriptions of past actions!
22
+ Write explicit EXPECTED STATE in noteToFutureSelf so your future self can verify against the future screenshot.
23
+ Example: "Clicked hamburger menu (was collapsed). EXPECT: menu expanded with 'settings' items visible"`;
24
+ const SITE_LEARNINGS_GUIDE = `SITE LEARNINGS: Build mental model (persistent across journeys)
25
+
26
+ NAMING (check SCREEN STATE VOCABULARY first):
27
+ - screen: REUSE from vocabulary ("login", "dashboard") or create if new. NEVER: "about:blank", "loading"
28
+ - state: INFER from COMPLETED STEPS (max 3 GENERIC dims - user role/context, NOT specific data)
29
+ Dimensions describe USER STATE (logged-in, admin, cart-empty), NOT data values (workspace names, usernames, products)
30
+ "logged-in,admin", "guest,cart-empty"
31
+ ❌ "testchimp-selected" (workspace name is data!), "user-john" (username is data!)
32
+
33
+ LEARNINGS (semantic insights that persist):
34
+ Focus on BEHAVIOR and PATTERNS that will help on future runs, when SoM IDs are completely different.
35
+
36
+ WHY NO SOM IDS: SoM markers (1, 2, [5], [6], element 9) regenerate EVERY page load - different numbers each time!
37
+ A learning with "element 9" is useless on next run when that same button is "element 3".
38
+
39
+ STORE: Non-obvious behavior, interaction quirks, selector strategies
40
+ ✅ "Dropdown opens on caret icon click, not container div"
41
+ "Delete requires overflow menu (not directly visible)"
42
+ "Search triggers on Enter, not auto-search while typing"
43
+
44
+ DON'T STORE: Element catalogs, SoM IDs, obvious facts, attribute documentation
45
+ "Continue with Google button" (element listing - adds no behavioral value)
46
+ "opener is SoM id [6]" (ephemeral - will be different ID next run!)
47
+ "input name=emailOrUsername" (documenting HTML - not useful)
48
+
49
+ Ask: "Will this help when SoM IDs are completely different?" NO → don't store
50
+
51
+ STEP COMPLETION: Check ALL signals (memory, URL, screenshot, noteToFutureSelf) vs step goal.
52
+ Process: Expected (from noteToSelf) Actual (commands success? URL changed? content visible?) Decide
53
+ - Commands ✓ + URL changed + expected page → COMPLETE
54
+ - Commands + error shown CONTINUE (retry)
55
+ - Command failed → CONTINUE (different selector)
56
+
57
+ `;
58
+ const NOTETOSELF_GUIDE = `NOTETOSELF: Capture thinking/intentions + EXPLICIT EXPECTED STATE for verification.
59
+ "Clicked menu. EXPECT: expanded with 'Settings' visible"
60
+ "Click menu" (future can't verify!)
61
+ Include: strategy, backups if fails, what to verify next.`;
62
+ // Response schema - exact TypeScript interface the agent must follow
63
+ const RESPONSE_SCHEMA = `
64
+ RESPONSE FORMAT (exact TypeScript interface):
65
+
66
+ interface AgentDecision {
67
+ // Required fields
68
+ status: 'complete' | 'stuck' | 'infeasible' | 'continue';
69
+ statusReasoning: string;
70
+ reasoning: string;
61
71
 
62
- "commands": [ // Mix ref and playwright commands as needed
63
- { "type": "playwright", "code": "await page.goto('https://example.com')" },
64
- { "type": "ref", "ref": "e22", "operation": "fill", "value": "text" },
65
- { "type": "ref", "ref": "e31", "operation": "click" },
66
- { "type": "playwright", "code": "await page.waitForLoadState('networkidle')" }
67
- ],
72
+ // Screen identification (REQUIRED - always identify current screen)
73
+ screenState: {
74
+ screen: string; // Screen name - REUSE from SCREEN STATE KNOWLEDGE if possible
75
+ state: string; // State dimensions: "admin", "admin,empty-cart", "" for default
76
+ };
68
77
 
69
- "toolCalls": [{ "name": "tool_name", "params": {} }],
70
- "blockerDetected": { "description": "...", "clearingCommands": ["..."] },
71
- "experiences": ["app pattern"],
72
- "noteToFutureSelf": "See NOTETOSELF GUIDELINES below",
73
- "debugInfo": { // OPTIONAL: Only if you have confident prompt improvement suggestions
74
- "suggestedPromptUpdates": "Add instruction: When form has Country dropdown, select country BEFORE filling phone (enables country code)",
75
- "reasoning": "Encountered this pattern 3 times - dropdown selection unlocks dependent fields"
76
- }
77
- }
78
-
79
- NOTETOSELF: Your only cognition continuity - capture THINKING/INTENTIONS (history has actions).
80
- Include: strategy, hypothesis, alternatives/backups if fails, what to verify next, observations.
81
- Example: "Strategy: Clicking ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands with nav options."
82
-
83
- META-LEARNING (debugInfo): Could this prompt have been better. Suggest fixes.
84
-
85
- STATUS: complete=goal achieved, continue=need more, stuck=5 fails, infeasible=impossible.
86
- RULES: Do only step goal. Minimal commands. Try different selectors if fail. Use blockerDetected for modals.
87
-
88
- COMMANDS FORMAT:
89
-
90
- Array of plain Playwright command strings:
91
- {
92
- "commands": [
93
- "await page.fill('input[name=\"email\"]', 'user@test.com')",
94
- "await page.fill('input[name=\"password\"]', 'secret123')",
95
- "await page.click('button[type=\"submit\"]')"
96
- ]
97
- }
98
-
99
- SELECTOR STRATEGIES (use in order of preference):
100
- 1. getByRole: page.getByRole('button', {name: 'Login'})
101
- 2. getByLabel: page.getByLabel('Email address')
102
- 3. getByPlaceholder: page.getByPlaceholder('Enter email')
103
- 4. getByText: page.getByText('Sign in')
104
- 5. CSS: page.locator('input[name="email"]')
105
- 6. Test IDs: page.getByTestId('login-button')
106
-
107
- Example login commands:
108
- {
109
- "commands": [
110
- "await page.getByLabel('Email').fill('user@test.com')",
111
- "await page.getByLabel('Password').fill('secret123')",
112
- "await page.getByRole('button', {name: 'Submit'}).click()"
113
- ]
78
+ // Site learnings (OPTIONAL - only when learning something NEW/IMPORTANT)
79
+ siteLearningsUpdate?: {
80
+ screens?: {
81
+ [screenName: string]: {
82
+ states: {
83
+ [stateName: string]: {
84
+ observations?: Array<{ id?: number; text: string }>; // Add (no id) or Update (with id)
85
+ deleteObservationIds?: number[];
86
+ };
87
+ };
88
+ };
89
+ };
90
+ uxPatterns?: Array<{ id?: number; text: string }>; // Add (no id) or Update (with id)
91
+ deleteUxPatternIds?: number[];
92
+ };
93
+
94
+ // Commands to execute
95
+ commands?: Array<SomCommand | string>;
96
+ commandReasoning?: string;
97
+
98
+ // Note to future self (your only memory continuity)
99
+ noteToFutureSelf?: string;
100
+
101
+ // Other optional fields
102
+ toolCalls?: Array<{ name: string; params: any }>;
103
+ toolReasoning?: string;
104
+ blockerDetected?: { description: string; clearingCommands: string[] };
105
+ memoryUpdate?: { action: string; observation: string; extractedData?: Record<string, any> };
114
106
  }
115
107
 
116
- INTERACTIVE ELEMENTS section shows available selectors for each element.`;
117
- }
108
+ CRITICAL: uxPatterns array must have objects with BOTH id and text fields!
109
+ Example: { "id": 1, "text": "Pattern description" } or { "text": "New pattern" } (no id for new)
110
+ `;
111
+ // ===================================================================
112
+ class OrchestratorPrompts {
118
113
  /**
119
114
  * Build SoM (Set-of-Marks) system prompt for visual element identification
120
115
  */
121
- static buildSomSystemPrompt(restrictCoordinates = false) {
116
+ static buildSomSystemPrompt(restrictCoordinates = false, toolDescriptions) {
122
117
  const coordinateRestriction = restrictCoordinates ? `
123
118
 
124
119
  CRITICAL: COORDINATE COMMANDS RESTRICTED
@@ -130,32 +125,24 @@ Strong preference order:
130
125
  3. ONLY IF NO OTHER OPTION EXISTS: use coordinate commands
131
126
 
132
127
  If you use coordinates, you MUST explain in commandReasoning why no SoM-marked alternative exists.` : '';
133
- return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}
128
+ const toolSection = toolDescriptions ? `
134
129
 
135
- DISCRETE EXPERIENCE LOOP:
136
- You operate in iterations: receive state → decide → sleep → wake with new state.
137
- System waits for page stability after each batch.
130
+ AVAILABLE TOOLS:
131
+ ${toolDescriptions}
138
132
 
139
- CRITICAL: MEMORY ISOLATION
140
- You have NO memory between iterations. Each "wake up" is like a fresh start - you only see:
141
- - Current screenshot
142
- - Current step goal
143
- - Previous step descriptions
144
- - Your noteToFutureSelf from last iteration
133
+ Use tools to gather additional information when needed before executing commands.` : '';
134
+ return `You are an intelligent test automation agent using Set-of-Marks (SoM) visual element identification.${coordinateRestriction}${toolSection}
135
+
136
+ ${DISCRETE_EXPERIENCE_LOOP}
145
137
 
146
- The noteToFutureSelf is your ONLY way to maintain a continuous stream of thinking across iterations. Use it strategically to:
147
- • Document your current intentions and strategy
148
- • Record what you were thinking/planning
149
- • Give specific advice to your future self about what to look for
150
- • Note any observations or patterns you've discovered
151
- • Suggest backup plans if current approach fails
138
+ ${NOTETOSELF_GUIDE}
152
139
 
153
140
  IMPORTANT: You will receive a screenshot with COLOR-CODED BOUNDING BOXES and IDs overlaid on interactive elements.
154
141
 
155
142
  SCREENSHOT SCOPE:
156
- - Shows VIEWPORT ONLY (what's currently visible, not full page)
157
- - Elements outside the viewport are NOT shown (you must scroll to reveal them)
158
- - If you need to see more: use SCROLL action or take_screenshot tool with isFullPage=true
143
+ - Shows FULL PAGE (entire scrollable content, including below-fold elements)
144
+ - ALL interactive elements across the entire page are marked with SoM IDs
145
+ - You can see and interact with any element on the page without scrolling
159
146
 
160
147
  VISUAL MARKER SYSTEM:
161
148
  - Each interactive element has a colored bounding box with a unique color
@@ -213,180 +200,19 @@ COMMANDS ARRAY: Mix actions (has 'action') and verifications (has 'verificationT
213
200
  Example: [{"elementRef":"4","action":"fill","value":"Hello"}, {"elementRef":"3","verificationType":"textContains","expected":"You: Hello"}]
214
201
  CRITICAL: Verification steps MUST generate verification commands (never 0 commands) - don't just visually confirm!
215
202
 
216
- COORDINATES (when SoM marker missing):
217
- Use percentage-based coords for unmarked elements:
218
- { "action": "click", "coord": { "x": 85.625, "y": 12.375 } }
219
-
220
- Format: percentages 0-100, MUST use 3 decimals (0.000 = top-left, 50.000 = center, 100.000 = bottom-right).
221
- After coord click, magenta "clicked" marker appears. Use view_previous_screenshot tool to verify if result unexpected.
222
-
223
- NAVIGATION: Use navigate/goBack/goForward/reload actions (no elementRef needed).
224
- Example: { "action": "navigate", "value": "https://..." }
225
- DON'T click address bar - use navigate action. System waits for page load after navigation.
226
-
227
- // Available actions: click, doubleClick, rightClick, hover, drag, fill, press, select, check, uncheck, focus, blur, scroll, navigate, goBack, goForward, reload
228
- // Available verifications: textContains, textEquals, valueEquals, valueEmpty, isVisible, isHidden, isEnabled, isDisabled, isChecked, isUnchecked, countEquals, countGreaterThan, countLessThan, hasClass, hasAttribute
203
+ ${RESPONSE_SCHEMA}
229
204
 
230
- interface AgentDecisionLLMResponse {
231
- status: "continue" | "complete" | "stuck" | "infeasible";
232
- reasoning: string;
233
- commands?: (SomCommand | SomVerification)[]; // REPAIR MODE: Can be empty [] if step already done/obsolete
234
- commandReasoning?: string;
235
- toolCalls?: Array<{ name: string; params: Record<string, any> }>;
236
- noteToFutureSelf?: string;
237
- experiences?: string[];
238
- blockerDetected?: { description: string; clearingCommands: SomCommand[] };
239
- debugInfo?: { suggestedPromptUpdates?: string; reasoning?: string };
240
- }
241
- \`\`\`
242
-
243
- NOTETOSELF: Your only continuity. Include: hypothesis, strategy, backup plans if fails, what to verify, observations.
244
- Example: "Strategy: Click ID 1 for menu. Backup: try ID 2/3 or coord (8%,15%). Want to verify: menu expands."
245
-
246
- EXAMPLE RESPONSES:
247
-
248
- Action step:
249
- \`\`\`json
250
- {
251
- "status": "continue",
252
- "reasoning": "Need to fill login form with credentials",
253
- "commands": [
254
- { "elementRef": "5", "action": "fill", "value": "user@example.com" },
255
- { "elementRef": "7", "action": "fill", "value": "password123" },
256
- { "elementRef": "12", "action": "click" }
257
- ],
258
- "commandReasoning": "Filling email (ID 5), password (ID 7), clicking submit (ID 12)"
259
- }
260
- \`\`\`
261
-
262
- Verification step:
263
- \`\`\`json
264
- {
265
- "status": "complete",
266
- "reasoning": "Message sent and verified in conversation",
267
- "commands": [
268
- { "elementRef": "3", "verificationType": "textContains", "expected": "You: Hello", "description": "Message appears in thread" },
269
- { "elementRef": "4", "verificationType": "valueEmpty", "description": "Input cleared" }
270
- ],
271
- "commandReasoning": "Verifying message visible in conversation (ID 3) and input empty (ID 4)"
272
- }
273
- \`\`\`
274
-
275
- REPAIR MODE - Step already completed (DELETE case):
276
- \`\`\`json
277
- {
278
- "status": "complete",
279
- "reasoning": "Step asked to 'Dismiss welcome modal' but I see no modal in current screenshot - it was already dismissed by prior steps",
280
- "commands": [],
281
- "commandReasoning": "No commands needed - step goal already achieved/obsolete"
282
- }
283
- \`\`\`
205
+ COORDS: { "action": "click", "coord": { "x": 85.625, "y": 12.375 } }. Use 3 decimals, 0-100%.
206
+ NAVIGATE: { "action": "navigate", "value": "https://..." }
207
+ SCROLL: { "action": "scroll", "scrollDirection": "down", "scrollAmount": 500 }
208
+ PRESS: { "elementRef": "5", "action": "press", "value": "Enter" } (NO coord for press!)
284
209
 
285
- OUTPUT FORMAT: JSON matching AgentDecisionLLMResponse interface above.`;
286
- }
287
- /**
288
- * Build coordinate-specific system prompt (used when selectors repeatedly fail)
289
- */
290
- static buildCoordinateSystemPrompt() {
291
- return `You are a visual web automation expert. Selector generation has FAILED multiple times.
292
-
293
- YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
294
-
295
- SCREENSHOT PROVIDED:
296
- You will see a screenshot with color-coded bounding boxes and ID labels attached to each element.
297
-
298
- CRITICAL - IDENTIFY THE CORRECT ELEMENT:
299
- 1. READ the step goal carefully - what specific element are you looking for?
300
- 2. Look for the colored bounding box that matches the element description
301
- 3. The ID label is at TOP-RIGHT corner, ABOVE the box (bottom of label touches top of box)
302
- 4. Match the label color to the bounding box color
303
- 5. LOCATE that element in the screenshot (NOT a similar-looking element!)
304
- 6. VERIFY position using screen regions:
305
- - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
306
- - Center content: xPercent ~30-70%
307
- - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
308
- 7. CALCULATE percentages from element's CENTER position
309
- 8. SANITY CHECK your percentages:
310
- - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
311
- - Button in top-left at 90%? WRONG - that's top-right!
312
- - Element description says "left" but x > 50%? WRONG - recheck!
313
-
314
- Example thought process:
315
- Goal: "Click Settings link in left navigation"
316
- → I see "Settings" text in LEFT navigation panel in the screenshot
317
- → Visual estimate: The link appears in the far left sidebar
318
- → Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
319
- → Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
320
- → xPercent: 12.500, yPercent: 32.000
321
- → Sanity check: 12.5% is FAR LEFT (NOT 80%+ which would be far right!)
322
- → Description: "Clicking center of Settings link in left sidebar"
323
-
324
- CRITICAL VISUAL ESTIMATION TIPS:
325
- - Divide screenshot mentally into quadrants/regions
326
- - Left sidebar usually ~5-20% from left, center content ~30-70%, right sidebar ~75-95%
327
- - Aim for CENTER of element, not edges
328
- - Top bar usually 0-10% from top, footer usually 90-100%
329
- - Be conservative: slightly off-center is better than way off
330
-
331
- YOUR RESPONSE FORMAT - Output JSON matching this interface:
332
-
333
- interface AgentDecisionLLMResponse {
334
- status: string; // REQUIRED: "continue" (usually for coordinate mode)
335
- reasoning: string; // REQUIRED: "I see [element] at (X%, Y%) - using coordinates"
336
- coordinateAction: { // REQUIRED in coordinate mode
337
- type: "coordinate";
338
- action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
339
- xPercent: number; // 0-100, 3 decimals
340
- yPercent: number; // 0-100, 3 decimals
341
- toXPercent?: number; // For drag
342
- toYPercent?: number; // For drag
343
- value?: string; // For fill
344
- scrollAmount?: number; // For scroll
345
- };
346
- noteToFutureSelf?: string; // Optional: What to try if this fails
347
- }
348
-
349
- COORDINATE REFERENCE:
350
- - Top-left corner: xPercent=0, yPercent=0
351
- - Top-right corner: xPercent=100, yPercent=0
352
- - Bottom-left corner: xPercent=0, yPercent=100
353
- - Bottom-right corner: xPercent=100, yPercent=100
354
- - Center of screen: xPercent=50, yPercent=50
355
-
356
- Use 3 decimal places for precision (e.g., 15.755, not 16).
357
-
358
- ACTIONS:
359
-
360
- **Physical clicks:**
361
- - click: { action: "click", xPercent: 15.755, yPercent: 8.500 }
362
- - doubleClick: { action: "doubleClick", xPercent: 15.755, yPercent: 8.500 }
363
- - rightClick: { action: "rightClick", xPercent: 15.755, yPercent: 8.500 }
364
- - hover: { action: "hover", xPercent: 15.755, yPercent: 8.500 }
365
-
366
- **Input actions:**
367
- - fill: Click then type
368
- { action: "fill", xPercent: 30.000, yPercent: 25.000, value: "alice@example.com" }
369
-
370
- **Movement actions:**
371
- - drag: From one position to another
372
- { action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000 }
373
- - scroll: At position, scroll by amount
374
- { action: "scroll", xPercent: 50.000, yPercent: 50.000, scrollAmount: 500 }
375
-
376
- CRITICAL RULES:
377
- - Percentages are from viewport TOP-LEFT (not full page)
378
- - Use element CENTER for coordinates, not edges
379
- - Be precise with decimals - wrong coords click wrong element
380
- - For fill: system will click at (x%,y%) then type value automatically
381
- - For drag: toXPercent/toYPercent are REQUIRED
382
-
383
- DO NOT try to generate selectors - that approach already failed. Use coordinates only.
384
- This is a last-resort mechanism, but it WILL work if you provide accurate percentages.`;
210
+ OUTPUT: Return valid JSON. Example: { "status": "complete", "commands": [{"elementRef":"5","action":"fill","value":"test"}], "screenState": {"screen":"login","state":""} }`;
385
211
  }
386
212
  /**
387
213
  * Build user prompt with context
388
214
  */
389
- static buildUserPrompt(context, consecutiveFailures, enableCoordinateMode = false) {
215
+ static buildUserPrompt(context, consecutiveFailures) {
390
216
  const parts = [];
391
217
  // Add SoM format reminder if screenshot is present
392
218
  if (context.somScreenshot) {
@@ -413,50 +239,36 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
413
239
  }
414
240
  parts.push(``);
415
241
  }
416
- parts.push(`REPAIR STRATEGY:`);
417
- parts.push(`- CRITICAL: First check if this step is STILL NEEDED (may already be done by prior step or now obsolete)`);
418
- parts.push(` → If step goal already achieved/no longer needed: Return 0 commands + status "complete" (DELETE case)`);
419
- parts.push(` → Example: "Dismiss modal" but modal already gone → 0 commands, status "complete"`);
420
- parts.push(`- Use SoM markers to identify current elements`);
421
- parts.push(`- Generate commands that work with CURRENT UI (not original script)`);
422
- parts.push(`- CRITICAL: Once you fix this step, return status "complete" IMMEDIATELY (control goes back to script)`);
423
- parts.push(` → Repair mode = single step fix, then hand back control`);
424
- parts.push(` → Don't continue to next steps - script will auto-execute them`);
425
- parts.push(`- DON'T redo completed steps - only fix the blocker\n`);
242
+ parts.push(`STRATEGY: Check if step still needed. Fix using current UI. Return "complete" when fixed.\n`);
426
243
  }
427
- // Put static instructions first for LLM caching efficiency
428
- parts.push('STEP EXECUTION RULES:');
429
- parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
430
- parts.push('- If step doesn\'t say "verify/check/confirm" DON\'T add expect() assertions');
431
- parts.push('- Mark "complete" ONLY if commands succeeded');
244
+ // Concise rules for LLM caching
245
+ parts.push('RULES: Do step goal only. No verify commands unless step asks. Check prerequisites before advancing.');
246
+ // TEMPORARY: Always full-page mode during debugging
247
+ // TODO: Re-enable adaptive heuristic once verified working
248
+ parts.push('- Screenshot shows FULL PAGE (all content visible, including offscreen elements)');
249
+ parts.push('- All interactive elements are marked with SoM IDs, even those below the fold');
250
+ // Kept for future reference when re-enabling adaptive mode:
251
+ // const heightOk = context.pageHeight && context.viewportHeight && context.pageHeight < (context.viewportHeight * 2);
252
+ // const widthOk = context.pageWidth && context.viewportWidth && context.pageWidth < (context.viewportWidth * 2);
253
+ // const isCompactPage = heightOk && widthOk;
432
254
  parts.push('- Screenshot tool: Use ONCE for visual context, then ACT (max 3 per step, system enforced)');
433
255
  parts.push('- Max 5 iterations per step, then forced STUCK\n');
434
256
  // Dynamic content follows (changes per iteration)
435
257
  parts.push('=== CURRENT CONTEXT ===\n');
436
258
  // Display note from previous iteration (high priority tactical info)
437
- if (context.noteFromPreviousIteration) {
438
- const note = context.noteFromPreviousIteration;
259
+ if (context.journeyMemory.latestNote) {
260
+ const note = context.journeyMemory.latestNote;
439
261
  parts.push(`📝 YOUR NOTE FROM PREVIOUS ITERATION:`);
440
- parts.push(` ${note.content}`);
441
- parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
442
- parts.push(``);
443
- parts.push(` ACTION REQUIRED:`);
444
- parts.push(` 1. Did your previous action work? Check the screenshot!`);
445
- parts.push(` 2. If it WORKED: Execute next step from your plan`);
446
- parts.push(` 3. If it FAILED: Use your backup plan (try alternative IDs/methods)`);
447
- parts.push(` 4. Write NEW noteToFutureSelf with:`);
448
- parts.push(` - What worked/didn't work (learn from attempts)`);
449
- parts.push(` - Updated strategy with new backup plan`);
450
- parts.push(` - Next alternatives to try if this fails`);
451
- parts.push(` - Build on previous note's reasoning`);
262
+ parts.push(`${note.content}`);
452
263
  parts.push(``);
453
- parts.push(` DON'T repeat failed actions - try your backup plan!`);
264
+ parts.push(`⚠️ Follow your own instructions above. Compare current screenshot to expected state.`);
454
265
  parts.push('');
455
266
  }
456
267
  // Check for screenshot loops (analysis paralysis) - PER STEP tracking
457
- const screenshotsThisStep = context.recentSteps.filter(s => s.stepNumber === context.stepNumber &&
268
+ const recentSteps = context.journeyMemory.history.slice(-6);
269
+ const screenshotsThisStep = recentSteps.filter(s => s.stepNumber === context.stepNumber &&
458
270
  (s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot')));
459
- const recentScreenshots = context.recentSteps.slice(-3).filter(s => s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'));
271
+ const recentScreenshots = recentSteps.slice(-3).filter(s => s.code.includes('take_screenshot') || s.action.toLowerCase().includes('screenshot'));
460
272
  if (screenshotsThisStep.length >= 3) {
461
273
  parts.push(`[CRITICAL] SCREENSHOT LOOP DETECTED - ${screenshotsThisStep.length} SCREENSHOTS THIS STEP`);
462
274
  parts.push(`ANALYSIS PARALYSIS! You keep gathering info but NEVER ACTING!`);
@@ -474,7 +286,8 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
474
286
  parts.push(`[WARNING] SYSTEM WARNING: ${consecutiveFailures} failures!`);
475
287
  // Only suggest screenshot if we haven't already taken multiple THIS STEP
476
288
  if (screenshotsThisStep.length === 0) {
477
- parts.push(`Take screenshot ONCE to see page state. Then ACT with selector.`);
289
+ parts.push(`Take full-page screenshot to see page state: { "name": "take_screenshot", "params": {"isFullPage": true} }`);
290
+ parts.push(`Then ACT with selector from the screenshot analysis.`);
478
291
  }
479
292
  else {
480
293
  parts.push(`You already have visual context. Try different selector NOW.`);
@@ -484,30 +297,16 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
484
297
  }
485
298
  else if (consecutiveFailures && consecutiveFailures >= 4) {
486
299
  parts.push(`[WARNING] CRITICAL: ${consecutiveFailures} failures!`);
487
- if (enableCoordinateMode) {
488
- parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
489
- }
490
- else {
491
- parts.push(`Next failure will force STUCK. Try radically different selector approach.\n`);
492
- }
493
- }
494
- // Trigger coordinate mode if many failures (Phase 1: after 3 failures) - ONLY if enabled
495
- if (enableCoordinateMode && consecutiveFailures && consecutiveFailures >= 3) {
496
- parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯🎯`);
497
- parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
498
- parts.push(`You MUST use coordinate-based action now (percentages).`);
499
- parts.push(`Provide coordinateAction with xPercent/yPercent (0-100, 3 decimals for precision).`);
500
- parts.push(`See system prompt for coordinate action format.`);
501
- parts.push(`🎯🎯🎯\n`);
300
+ parts.push(`Next failure will force STUCK. Try radically different approach.\n`);
502
301
  }
503
302
  // REPAIR MODE detection and instructions
504
303
  const isRepairMode = context.priorSteps !== undefined;
505
304
  if (isRepairMode) {
506
- parts.push(`⚠️ ⚠️ ⚠️ REPAIR MODE ⚠️ ⚠️ ⚠️`);
305
+ parts.push(`⚠️ ⚠️ REPAIR MODE ⚠️ ⚠️`);
507
306
  parts.push(`You are fixing a FAILED command from an existing script.`);
508
307
  parts.push(`CRITICAL: The script executed command-by-command and stopped at a failure.`);
509
308
  parts.push(`Your job: Fix ONLY the failing command. System will auto-execute remaining commands after.`);
510
- parts.push(`⚠️ ⚠️ ⚠️ ⚠️ ⚠️ ⚠️ ⚠️ ⚠️\n`);
309
+ parts.push(`⚠️ ⚠️\n`);
511
310
  // Show execution position summary
512
311
  const successCount = context.successfulCommandsInCurrentStep?.length || 0;
513
312
  const remainCount = context.remainingCommandsInCurrentStep?.length || 0;
@@ -556,7 +355,10 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
556
355
  parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
557
356
  parts.push(`${context.currentStepGoal}`);
558
357
  parts.push(``);
559
- parts.push(`[WARNING] IMPORTANT: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
358
+ parts.push(`[WARNING] BEFORE STARTING: Do prerequisites from prior steps still exist?`);
359
+ parts.push(` Example: Step 4 "Click Core HR" needs Step 3's "menu expanded" state`);
360
+ parts.push(` → Check screenshot: Is menu still expanded? If NO, re-expand before Step 4!`);
361
+ parts.push(`[WARNING] AFTER ACTING: Is THIS step's goal achieved? If YES, mark status="complete" NOW.`);
560
362
  parts.push(`[WARNING] CRITICAL: Only interact with elements you SEE in the screenshot - no guessing/hallucinating!`);
561
363
  parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
562
364
  if (!isRepairMode) {
@@ -569,50 +371,77 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
569
371
  }
570
372
  // SoM screenshot (if available)
571
373
  if (context.somScreenshot) {
572
- parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
573
- parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
574
- parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
575
- parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
576
- parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
577
- parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
578
- parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
579
- parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
580
- parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
581
- parts.push(``);
582
- // SoM element map for disambiguation
374
+ parts.push(`\nSET-OF-MARKS: Full page with color-coded boxes + IDs. Match label color to box. Use IDs in elementRef.`);
583
375
  if (context.somElementMap) {
584
- parts.push(`SOM ELEMENT DETAILS (for disambiguation):`);
585
- parts.push(`If unsure which ID matches your target (e.g., is it 11 or 12?), use this map:`);
376
+ parts.push(`\nELEMENT MAP (for disambiguation):`);
586
377
  parts.push(context.somElementMap);
587
- parts.push(`Example: If you need a "Submit" button and see IDs 5 and 6 are both buttons, check the map to see which one says "Submit".`);
588
378
  parts.push(``);
589
379
  }
590
380
  }
591
381
  // Current page state (most variable content - at the end)
592
382
  parts.push(`\nCURRENT PAGE:`);
593
383
  parts.push(`URL: ${context.currentURL}`);
594
- parts.push(`Title: ${context.currentPageInfo.title}`);
595
- // Only include DOM details if NOT in SoM mode
596
- if (!context.somScreenshot) {
597
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
598
- parts.push(context.currentPageInfo.formattedElements);
599
- parts.push(`\nARIA TREE (hierarchical structure):`);
600
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
384
+ parts.push(`Title: ${context.currentPageTitle}`);
385
+ // Page dimensions for scroll decisions
386
+ if (context.viewportWidth && context.viewportHeight && context.pageHeight) {
387
+ const heightOk = context.pageHeight < (context.viewportHeight * 2);
388
+ const widthOk = context.pageWidth && context.pageWidth < (context.viewportWidth * 2);
389
+ const isCompactPage = heightOk && widthOk;
390
+ parts.push(`\nPAGE DIMENSIONS & SCROLL POSITION:`);
391
+ parts.push(`Viewport: ${context.viewportWidth}x${context.viewportHeight}px`);
392
+ parts.push(`Full Page: ${context.pageWidth}x${context.pageHeight}px`);
393
+ parts.push(`Screenshot Mode: ${isCompactPage ? 'FULL PAGE (compact page, all visible)' : 'VIEWPORT ONLY (large page, full-page markers would be too small)'}`);
394
+ if (context.scrollY !== undefined && context.scrollY > 0) {
395
+ parts.push(`Current Scroll: ${context.scrollY}px from top (you've already scrolled down)`);
396
+ }
397
+ else {
398
+ parts.push(`Current Scroll: At top of page (scrollY = 0)`);
399
+ }
400
+ const canScrollDown = context.pageHeight > context.viewportHeight;
401
+ const canScrollRight = context.pageWidth && context.pageWidth > context.viewportWidth;
402
+ if (!isCompactPage && (canScrollDown || canScrollRight)) {
403
+ const remainingBelow = Math.max(0, context.pageHeight - context.viewportHeight - (context.scrollY || 0));
404
+ const remainingRight = context.pageWidth ? Math.max(0, context.pageWidth - context.viewportWidth - (context.scrollX || 0)) : 0;
405
+ const hiddenContent = [];
406
+ if (remainingBelow > 0)
407
+ hiddenContent.push(`${remainingBelow}px below`);
408
+ if (remainingRight > 0)
409
+ hiddenContent.push(`${remainingRight}px to right`);
410
+ if (hiddenContent.length > 0) {
411
+ parts.push(`Hidden content: ${hiddenContent.join(', ')}`);
412
+ parts.push(`💡 If element not found → Call: take_screenshot with {"isFullPage": true, "purpose": "Find X"}`);
413
+ parts.push(` This shows entire page (markers small but LLM can still locate elements)`);
414
+ }
415
+ }
416
+ else if (isCompactPage) {
417
+ parts.push(`All content visible in screenshot (no need for additional tools)`);
418
+ }
601
419
  }
602
- else {
603
- // In SoM mode, skip DOM details - agent uses visual screenshot instead
604
- parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
420
+ // In SoM mode, element details are in somElementMap (visual screenshot)
421
+ parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
422
+ parts.push('');
423
+ // Show current URL with change detection
424
+ const currentUrl = truncateUrl(context.currentURL || '');
425
+ const lastAction = context.journeyMemory.history[context.journeyMemory.history.length - 1];
426
+ if (lastAction && lastAction.previousUrl && lastAction.url !== lastAction.previousUrl) {
427
+ const prevUrl = truncateUrl(lastAction.previousUrl);
428
+ const newUrl = truncateUrl(lastAction.url);
429
+ parts.push(`🔄 URL CHANGED: ${prevUrl} → ${newUrl}`);
430
+ parts.push(` ⚠️ Navigation occurred! Previous action likely succeeded and triggered page transition.\n`);
605
431
  }
606
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
607
- parts.push('... (truncated)');
432
+ else {
433
+ parts.push(`📍 Current URL: ${currentUrl}\n`);
608
434
  }
609
- parts.push('');
610
435
  // Recent steps (most variable content - at the end)
611
- if (context.recentSteps.length > 0) {
612
- parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
613
- for (const step of context.recentSteps) {
436
+ const recentStepsDisplay = context.journeyMemory.history.slice(-6);
437
+ if (recentStepsDisplay.length > 0) {
438
+ parts.push(`RECENT STEPS (last ${recentStepsDisplay.length}):`);
439
+ for (const step of recentStepsDisplay) {
614
440
  const status = step.result === 'success' ? '[OK]' : '[FAIL]';
615
- parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
441
+ const urlChanged = step.previousUrl && step.url !== step.previousUrl
442
+ ? ` [URL: ${step.previousUrl} → ${step.url}]`
443
+ : '';
444
+ parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}${urlChanged}`);
616
445
  parts.push(` Code: ${step.code}`);
617
446
  if (step.result === 'failure' && step.error) {
618
447
  parts.push(` ERROR: ${step.error}`);
@@ -624,7 +453,7 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
624
453
  }
625
454
  parts.push('');
626
455
  // Detect repeated failures
627
- const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
456
+ const recentFailures = recentStepsDisplay.filter(s => s.result === 'failure');
628
457
  if (recentFailures.length >= 2) {
629
458
  const sameSelector = recentFailures.slice(-2).every((s, i, arr) => i === 0 || s.code === arr[i - 1].code);
630
459
  if (sameSelector) {
@@ -634,18 +463,49 @@ This is a last-resort mechanism, but it WILL work if you provide accurate percen
634
463
  }
635
464
  }
636
465
  }
637
- // Experiences (app-specific patterns learned)
638
- if (context.experiences && context.experiences.length > 0) {
639
- parts.push(`\nEXPERIENCES (patterns you've learned about this app):`);
640
- for (const exp of context.experiences) {
641
- parts.push(` • ${exp}`);
466
+ // Site learnings (persistent knowledge)
467
+ if (context.siteLearnings) {
468
+ const { screens, uxPatterns } = context.siteLearnings;
469
+ // Display UX patterns with IDs
470
+ const uxPatternEntries = Object.entries(uxPatterns);
471
+ if (uxPatternEntries.length > 0) {
472
+ parts.push(`\n🎯 SITE-WIDE UX PATTERNS (reference [ID] for updates/deletes):`);
473
+ uxPatternEntries.forEach(([id, text]) => parts.push(` [${id}] ${text}`));
474
+ parts.push('');
475
+ }
476
+ // Display screen/state vocabulary first (for consistent naming)
477
+ if (context.siteLearnings?.screenStateVocabulary && Object.keys(context.siteLearnings.screenStateVocabulary).length > 0) {
478
+ parts.push(`\n📋 SCREEN STATE VOCABULARY (use these names for consistency):`);
479
+ Object.entries(context.siteLearnings.screenStateVocabulary).forEach(([screenName, stateNames]) => {
480
+ const statesDisplay = stateNames.length > 0
481
+ ? ` → States: ${stateNames.map(s => s || '""').join(', ')}`
482
+ : '';
483
+ parts.push(` • ${screenName}${statesDisplay}`);
484
+ });
485
+ parts.push('');
486
+ }
487
+ // Display screen state knowledge with IDs
488
+ if (screens && Object.keys(screens).length > 0) {
489
+ parts.push(`\n📚 SCREEN STATE KNOWLEDGE (reference [ID] for updates/deletes):`);
490
+ Object.entries(screens).forEach(([screenName, screenLearnings]) => {
491
+ Object.entries(screenLearnings.states).forEach(([state, learning]) => {
492
+ const stateLabel = state ? `[${state}]` : '';
493
+ parts.push(`\n ${screenName}${stateLabel}:`);
494
+ const obsEntries = Object.entries(learning.observations);
495
+ if (obsEntries.length > 0) {
496
+ obsEntries.forEach(([id, text]) => {
497
+ parts.push(` [${id}] ${text}`);
498
+ });
499
+ }
500
+ });
501
+ });
502
+ parts.push('');
642
503
  }
643
- parts.push('');
644
504
  }
645
505
  // Extracted data (from previous extract_data tool calls)
646
- if (context.extractedData && Object.keys(context.extractedData).length > 0) {
506
+ if (context.journeyMemory.extractedData && Object.keys(context.journeyMemory.extractedData).length > 0) {
647
507
  parts.push(`\nEXTRACTED DATA (available for use in commands):`);
648
- parts.push(JSON.stringify(context.extractedData, null, 2));
508
+ parts.push(JSON.stringify(context.journeyMemory.extractedData, null, 2));
649
509
  parts.push('');
650
510
  }
651
511
  return parts.join('\n');
@@ -662,19 +522,20 @@ DISCRETE EXPERIENCE LOOP:
662
522
  You operate in iterations: receive state → decide → sleep → wake with new state.
663
523
  System waits for page stability after each batch.
664
524
 
665
- CRITICAL: MEMORY ISOLATION
666
- You have NO memory between iterations. Each "wake up" is like a fresh start - you only see:
667
- - Current screenshot
668
- - Current journey goal
669
- - Previous step descriptions
525
+ CRITICAL: NO SCREENSHOT MEMORY (STATELESS!)
526
+ Each iteration you receive:
527
+ - Current screenshot (NOT previous screenshots!)
528
+ - Past actions (text descriptions, not screenshots)
529
+ - Ongoing memory (experiences, patterns)
670
530
  - Your noteToFutureSelf from last iteration
531
+ - Current journey goal
671
532
 
672
- The noteToFutureSelf is your ONLY way to maintain a continuous stream of thinking across iterations. Use it strategically to:
673
- Document your current intentions and strategy
674
- Record what you were thinking/planning
675
- Give specific advice to your future self about what to look for
676
- Note any observations or patterns you've discovered
677
- Suggest backup plans if current approach fails
533
+ The noteToFutureSelf is your way to document expectations for verification. MUST include EXPLICIT EXPECTED STATE:
534
+ GOOD: "Clicked sidebar menu button (was collapsed). EXPECT: expanded sidebar with 'Dashboard' and 'Reports' visible"
535
+ GOOD: "Navigated to /settings. EXPECT: URL changed, 'Save Settings' button visible"
536
+ BAD: "Clicked menu" (future you can't verify if it worked!)
537
+ BAD: "Clicked ID 8" (ID meaningless without screenshot!)
538
+ Also include: strategy, observations, patterns discovered, backup plans if this fails
678
539
 
679
540
  COMMON UX PATTERNS (critical for navigation):
680
541
  • Disabled buttons → Fill required fields first to enable them
@@ -689,31 +550,13 @@ COMMON UX PATTERNS (critical for navigation):
689
550
  • Lazy loading → Scroll down to load more content
690
551
  • Accordions/expandable → Click header to toggle visibility
691
552
 
692
- YOUR RESPONSE FORMAT - Output JSON matching this interface:
553
+ ${RESPONSE_SCHEMA}
693
554
 
694
- interface AgentDecisionLLMResponse {
695
- status: string; // "continue" | "complete" | "stuck"
696
- reasoning: string; // What you're exploring and why
697
- stepSummary?: string; // Concise 1-sentence summary of what was done in this step
698
-
699
- // COMMANDS: Array of plain Playwright command strings
700
- commands?: string[]; // Example: ["await page.fill('input[name=\"email\"]', 'test@example.com')", ...]
701
- commandReasoning?: string;
702
- toolCalls?: Array<{ // Tools to call (extract_data for menus, etc.)
703
- name: string;
704
- params: Record<string, any>;
705
- }>;
706
- toolReasoning?: string;
707
- needsToolResults?: boolean;
708
- noteToFutureSelf?: string;
709
- coordinateAction?: { ... };
710
- experiences?: string[]; // Use for BOTH app patterns AND exploration progress
711
- blockerDetected?: { ... };
712
- debugInfo?: { // Meta-learning: suggest prompt improvements (only when very confident)
713
- suggestedPromptUpdates?: string;
714
- reasoning?: string;
715
- };
716
- }
555
+ YOUR RESPONSE FORMAT - Output JSON matching AgentDecision interface above.
556
+
557
+ For exploration mode, also include:
558
+ - stepSummary: Concise 1-sentence summary of what was accomplished this iteration
559
+ - commands: Array of plain Playwright command strings (exploration uses string commands, not SoM)
717
560
 
718
561
  EXPLORATION MODE GUIDELINES:
719
562
 
@@ -726,7 +569,7 @@ EXPLORATION MODE GUIDELINES:
726
569
 
727
570
  3. **VISIBLE ELEMENTS ONLY**: Screenshot shows viewport only. Only interact with elements you SEE. If not visible, scroll or take_screenshot(isFullPage=true).
728
571
 
729
- 4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in experiences, check history to avoid repeating, prioritize unexplored areas.
572
+ 4. **SYSTEMATIC EXPLORATION**: Use extract_data to discover, store in extractedData, track in siteLearningsUpdate, check history to avoid repeating, prioritize unexplored areas.
730
573
 
731
574
  5. **CREATIVE TESTING**: Test functionality thoroughly - try edge cases, verify features work, look for bugs.
732
575
 
@@ -741,7 +584,25 @@ EXPLORATION MODE GUIDELINES:
741
584
 
742
585
  11. **STEP SUMMARY**: When you complete actions, provide a concise 1-sentence summary of what was accomplished (e.g., "Logged in successfully", "Navigated to dashboard", "Created new widget"). This is used for step tracking, not future planning.
743
586
 
744
- 12. **MEMORY**: experiences=patterns, extractedData=discoveries, noteToFutureSelf=thinking/strategy/backups.
587
+ 12. **MEMORY (STATELESS!)**: You see only current screenshot. MUST write expected state in noteToFutureSelf:
588
+ - ✅ "Clicked settings button in navbar. EXPECT: settings page with 'Profile' section visible"
589
+ - ❌ "Clicked settings" (can't verify!)
590
+ - ❌ "Clicked ID 9" (ID meaningless without screenshot!)
591
+ - siteLearningsUpdate=persistent knowledge, extractedData=journey discoveries
592
+
593
+ SITE LEARNINGS: Build mental model (persistent across journeys)
594
+ - screenState: {screen, state} to identify current context (NEVER: "about:blank", "loading" states)
595
+ - siteLearningsUpdate: Add/update/delete observations per screen-state
596
+ CRITICAL: NEVER include SoM IDs ("element 9", "ID 5") - they regenerate every page load!
597
+ ✅ "Workspace selector opens on caret icon click"
598
+ ❌ "Element 9 opens dropdown with entries 6,7,8"
599
+
600
+ WHEN TO STORE:
601
+ ✅ After discovering navigation (uxPatterns)
602
+ ✅ After learning UI behavior (uxPatterns)
603
+ ✅ When understanding screen layout (observations)
604
+ ✅ When selector fails (observations)
605
+ ❌ Don't store obvious/temporary things
745
606
 
746
607
  CRITICAL: You're fully autonomous for THIS journey - no step-by-step instructions provided.
747
608
  YOU decide the exploration path to meet the journey goal based on: journey prompt, current state, and memory.`;
@@ -773,21 +634,20 @@ YOU decide the exploration path to meet the journey goal based on: journey promp
773
634
  parts.push(`PROGRESS: Step ${stepNumber}/${maxSteps} (you can complete earlier if journey goal met)\n`);
774
635
  }
775
636
  // Show discovered and tracked data from extractedData
776
- if (context.extractedData && Object.keys(context.extractedData).length > 0) {
637
+ if (context.journeyMemory.extractedData && Object.keys(context.journeyMemory.extractedData).length > 0) {
777
638
  parts.push(`\nDISCOVERED DATA (this journey):`);
778
- for (const [key, value] of Object.entries(context.extractedData)) {
639
+ for (const [key, value] of Object.entries(context.journeyMemory.extractedData)) {
779
640
  parts.push(` ${key}: ${value}`);
780
641
  }
781
642
  }
782
643
  // SoM screenshot (if available)
783
644
  if (context.somScreenshot) {
784
645
  parts.push(`\n SET-OF-MARKS SCREENSHOT (with element IDs):`);
785
- parts.push(`Screenshot shows VIEWPORT ONLY (current visible area, not full page).`);
786
- parts.push(`Color-coded bounding boxes mark interactive elements in the viewport.`);
646
+ parts.push(`Screenshot shows FULL PAGE (all content, including below-fold elements).`);
647
+ parts.push(`Color-coded bounding boxes mark ALL interactive elements across entire page.`);
787
648
  parts.push(`Each element has a unique color and an ID label (1, 2, 3, etc.) at TOP-RIGHT corner, OUTSIDE the box.`);
788
649
  parts.push(`Labels are typically positioned OUTSIDE and ABOVE the bounding box.`);
789
650
  parts.push(`TO FIND THE CORRECT ELEMENT: match the label color with the bounding box color.`);
790
- parts.push(`If target element not visible: SCROLL down/up OR use take_screenshot(isFullPage=true).`);
791
651
  parts.push(`Reference element IDs in your commands using elementRef field (e.g., "1", "2", "42").`);
792
652
  parts.push(`The screenshot is attached as an image - examine it to identify elements visually.`);
793
653
  parts.push(``);
@@ -808,41 +668,72 @@ YOU decide the exploration path to meet the journey goal based on: journey promp
808
668
  }
809
669
  parts.push(`\nCURRENT PAGE:`);
810
670
  parts.push(`URL: ${context.currentURL}`);
811
- parts.push(`Title: ${context.currentPageInfo.title}`);
812
- // Only include DOM details if NOT in SoM mode
813
- if (!context.somScreenshot) {
814
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
815
- parts.push(context.currentPageInfo.formattedElements);
816
- parts.push(`\nARIA TREE (hierarchical structure):`);
817
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
671
+ parts.push(`Title: ${context.currentPageTitle}`);
672
+ // In SoM mode, element details are in somElementMap
673
+ parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
674
+ // Recent actions
675
+ // Show current URL with change detection
676
+ const currentUrl = truncateUrl(context.currentURL || '');
677
+ const lastAction = context.journeyMemory.history[context.journeyMemory.history.length - 1];
678
+ if (lastAction && lastAction.previousUrl && lastAction.url !== lastAction.previousUrl) {
679
+ const prevUrl = truncateUrl(lastAction.previousUrl);
680
+ const newUrl = truncateUrl(lastAction.url);
681
+ parts.push(`\n🔄 URL CHANGED: ${prevUrl} → ${newUrl}`);
682
+ parts.push(` ⚠️ Navigation occurred! Previous action likely triggered page transition.\n`);
818
683
  }
819
684
  else {
820
- // In SoM mode, skip DOM details - agent uses visual screenshot
821
- parts.push(`\nNote: Element details available in visual screenshot with SoM markers.`);
822
- }
823
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
824
- parts.push('... (truncated)');
685
+ parts.push(`\n📍 Current URL: ${currentUrl}\n`);
825
686
  }
826
- // Recent actions
827
- if (context.recentSteps.length > 0) {
828
- parts.push(`\nRECENT ACTIONS (last ${context.recentSteps.length}):`);
829
- for (const step of context.recentSteps) {
687
+ const recentActions = context.journeyMemory.history.slice(-6);
688
+ if (recentActions.length > 0) {
689
+ parts.push(`RECENT ACTIONS (last ${recentActions.length}):`);
690
+ for (const step of recentActions) {
830
691
  const status = step.result === 'success' ? '[OK]' : '[FAIL]';
831
- parts.push(` ${status} ${step.action}`);
692
+ const urlChanged = step.previousUrl && step.url !== step.previousUrl
693
+ ? ` [URL: ${truncateUrl(step.previousUrl)} → ${truncateUrl(step.url)}]`
694
+ : '';
695
+ parts.push(` ${status} ${step.action}${urlChanged}`);
832
696
  parts.push(` ${step.observation}`);
833
697
  }
834
698
  }
835
- // Learnings and exploration progress
836
- if (context.experiences && context.experiences.length > 0) {
837
- parts.push(`\nEXPLORATION NOTES & APP PATTERNS:`);
838
- for (const exp of context.experiences) {
839
- parts.push(` • ${exp}`);
699
+ // Site learnings
700
+ if (context.siteLearnings) {
701
+ const { screens, uxPatterns } = context.siteLearnings;
702
+ const uxPatternEntries = Object.entries(uxPatterns);
703
+ if (uxPatternEntries.length > 0) {
704
+ parts.push(`\n🎯 SITE-WIDE UX PATTERNS (reference [ID] for updates/deletes):`);
705
+ uxPatternEntries.forEach(([id, text]) => parts.push(` [${id}] ${text}`));
706
+ }
707
+ // Display screen/state vocabulary first (for consistent naming)
708
+ if (context.siteLearnings?.screenStateVocabulary && Object.keys(context.siteLearnings.screenStateVocabulary).length > 0) {
709
+ parts.push(`\n📋 SCREEN STATE VOCABULARY (use these names for consistency):`);
710
+ Object.entries(context.siteLearnings.screenStateVocabulary).forEach(([screenName, stateNames]) => {
711
+ const statesDisplay = stateNames.length > 0
712
+ ? ` → States: ${stateNames.map(s => s || '""').join(', ')}`
713
+ : '';
714
+ parts.push(` • ${screenName}${statesDisplay}`);
715
+ });
716
+ }
717
+ if (screens && Object.keys(screens).length > 0) {
718
+ parts.push(`\n📚 SCREEN STATE KNOWLEDGE (reference [ID] for updates/deletes):`);
719
+ Object.entries(screens).forEach(([screenName, screenLearnings]) => {
720
+ Object.entries(screenLearnings.states).forEach(([state, learning]) => {
721
+ const stateLabel = state ? `[${state}]` : '';
722
+ parts.push(`\n ${screenName}${stateLabel}:`);
723
+ const obsEntries = Object.entries(learning.observations);
724
+ if (obsEntries.length > 0) {
725
+ obsEntries.forEach(([id, text]) => {
726
+ parts.push(` [${id}] ${text}`);
727
+ });
728
+ }
729
+ });
730
+ });
840
731
  }
841
732
  }
842
733
  // Note from previous iteration
843
- if (context.noteFromPreviousIteration) {
844
- parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.noteFromPreviousIteration.content}`);
845
- parts.push(`Did it work? If yes, continue plan. If failed, try backup alternatives.`);
734
+ if (context.journeyMemory.latestNote) {
735
+ parts.push(`\nYOUR NOTE FROM LAST ITERATION: ${context.journeyMemory.latestNote.content}`);
736
+ parts.push(` ^^ Follow your own instructions from previous iteration ^^`);
846
737
  }
847
738
  parts.push(`\nDECIDE NEXT ACTION: What to explore/test next? Check history to avoid repeating. Is goal achieved? Mark complete.`);
848
739
  return parts.join('\n');