testchimp-runner-core 0.0.32 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/dist/llm-facade.d.ts.map +1 -1
  2. package/dist/llm-facade.js +7 -7
  3. package/dist/llm-facade.js.map +1 -1
  4. package/dist/llm-provider.d.ts +9 -0
  5. package/dist/llm-provider.d.ts.map +1 -1
  6. package/dist/model-constants.d.ts +16 -5
  7. package/dist/model-constants.d.ts.map +1 -1
  8. package/dist/model-constants.js +17 -6
  9. package/dist/model-constants.js.map +1 -1
  10. package/dist/orchestrator/index.d.ts +1 -1
  11. package/dist/orchestrator/index.d.ts.map +1 -1
  12. package/dist/orchestrator/index.js +3 -2
  13. package/dist/orchestrator/index.js.map +1 -1
  14. package/dist/orchestrator/orchestrator-agent.d.ts +0 -8
  15. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  16. package/dist/orchestrator/orchestrator-agent.js +206 -405
  17. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  18. package/dist/orchestrator/orchestrator-prompts.d.ts +20 -0
  19. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
  20. package/dist/orchestrator/orchestrator-prompts.js +455 -0
  21. package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
  22. package/dist/orchestrator/tools/index.d.ts +2 -1
  23. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  24. package/dist/orchestrator/tools/index.js +4 -2
  25. package/dist/orchestrator/tools/index.js.map +1 -1
  26. package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
  27. package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
  28. package/dist/orchestrator/tools/verify-action-result.js +140 -0
  29. package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
  30. package/dist/orchestrator/types.d.ts +26 -0
  31. package/dist/orchestrator/types.d.ts.map +1 -1
  32. package/dist/orchestrator/types.js.map +1 -1
  33. package/dist/prompts.d.ts.map +1 -1
  34. package/dist/prompts.js +87 -37
  35. package/dist/prompts.js.map +1 -1
  36. package/dist/scenario-worker-class.d.ts.map +1 -1
  37. package/dist/scenario-worker-class.js +4 -1
  38. package/dist/scenario-worker-class.js.map +1 -1
  39. package/dist/utils/coordinate-converter.d.ts +32 -0
  40. package/dist/utils/coordinate-converter.d.ts.map +1 -0
  41. package/dist/utils/coordinate-converter.js +130 -0
  42. package/dist/utils/coordinate-converter.js.map +1 -0
  43. package/package.json +1 -1
  44. package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
  45. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
  46. package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
  47. package/plandocs/PHASE_1_COMPLETE.md +165 -0
  48. package/plandocs/PHASE_1_SUMMARY.md +184 -0
  49. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
  50. package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
  51. package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
  52. package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
  53. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
  54. package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
  55. package/src/llm-facade.ts +8 -8
  56. package/src/llm-provider.ts +11 -1
  57. package/src/model-constants.ts +17 -5
  58. package/src/orchestrator/index.ts +3 -2
  59. package/src/orchestrator/orchestrator-agent.ts +249 -424
  60. package/src/orchestrator/orchestrator-agent.ts.backup +1386 -0
  61. package/src/orchestrator/orchestrator-prompts.ts +474 -0
  62. package/src/orchestrator/tools/index.ts +2 -1
  63. package/src/orchestrator/tools/verify-action-result.ts +159 -0
  64. package/src/orchestrator/types.ts +48 -0
  65. package/src/prompts.ts +87 -37
  66. package/src/scenario-worker-class.ts +7 -2
  67. package/src/utils/coordinate-converter.ts +162 -0
  68. package/testchimp-runner-core-0.0.33.tgz +0 -0
  69. /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
  70. /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
  71. /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
@@ -7,6 +7,7 @@ import { expect } from '@playwright/test';
7
7
  import { LLMFacade } from '../llm-facade';
8
8
  import { ProgressReporter, TokenUsage, StepExecutionStatus } from '../progress-reporter';
9
9
  import { getEnhancedPageInfo } from '../utils/page-info-utils';
10
+ import { CoordinateConverter } from '../utils/coordinate-converter';
10
11
  import { ToolRegistry, ToolExecutionContext } from './tool-registry';
11
12
  import { DEFAULT_MODEL } from '../model-constants';
12
13
  import {
@@ -17,8 +18,11 @@ import {
17
18
  MemoryStep,
18
19
  OrchestratorStepResult,
19
20
  SelfReflection,
21
+ NoteToFutureSelf,
22
+ CoordinateAction,
20
23
  DEFAULT_AGENT_CONFIG
21
24
  } from './types';
25
+ import { OrchestratorPrompts } from './orchestrator-prompts';
22
26
 
23
27
  /**
24
28
  * Orchestrator Agent - manages step execution with tool use and memory
@@ -68,8 +72,10 @@ export class OrchestratorAgent {
68
72
 
69
73
  let iteration = 0;
70
74
  let previousReflection: SelfReflection | undefined = undefined;
75
+ let noteToSelf: NoteToFutureSelf | undefined = memory.latestNote; // Start with note from previous step
71
76
  const commandsExecuted: string[] = [];
72
77
  let consecutiveFailures = 0; // Track consecutive iterations with failed commands
78
+ let coordinateAttempts = 0; // Track coordinate mode attempts (max 2)
73
79
 
74
80
  while (iteration < this.config.maxIterationsPerStep) {
75
81
  iteration++;
@@ -85,11 +91,18 @@ export class OrchestratorAgent {
85
91
  scenarioSteps,
86
92
  memory,
87
93
  previousReflection,
88
- consecutiveFailures
94
+ consecutiveFailures,
95
+ noteToSelf // NEW: Pass note from previous iteration
89
96
  );
90
97
 
91
98
  // Call agent to make decision
92
- const decision = await this.callAgent(context, jobId, stepNumber, iteration, consecutiveFailures);
99
+ const decision = await this.callAgent(
100
+ context,
101
+ jobId,
102
+ stepNumber,
103
+ iteration,
104
+ consecutiveFailures
105
+ );
93
106
 
94
107
  // Log agent's reasoning
95
108
  this.logAgentDecision(decision, iteration);
@@ -100,6 +113,15 @@ export class OrchestratorAgent {
100
113
  // Execute tools if requested (tools are READ-ONLY, they don't change state)
101
114
  let toolResults: Record<string, any> = {};
102
115
 
116
+ // ANTI-LOOP: Detect if agent is taking screenshots repeatedly without acting
117
+ const recentScreenshots = memory.history.slice(-3).filter(s =>
118
+ s.code.includes('take_screenshot') || s.action.includes('Screenshot')
119
+ );
120
+ if (recentScreenshots.length >= 2 && iteration >= 3) {
121
+ this.logger?.(`[Orchestrator] ⚠️ WARNING: ${recentScreenshots.length} screenshots in last 3 iterations - agent may be looping`, 'warn');
122
+ this.logger?.(`[Orchestrator] 💭 System: Stop gathering info, START ACTING with available selectors`);
123
+ }
124
+
103
125
  if (decision.toolCalls && decision.toolCalls.length > 0) {
104
126
  toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
105
127
 
@@ -148,7 +170,7 @@ export class OrchestratorAgent {
148
170
  }
149
171
  }
150
172
 
151
- // Execute main commands (only if no blocker failure or continueOnFailure is true)
173
+ // Execute main commands (only if no blocker failure)
152
174
  if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
153
175
  const executeResult = await this.executeCommandsSequentially(
154
176
  decision.commands,
@@ -171,8 +193,161 @@ export class OrchestratorAgent {
171
193
  }
172
194
  }
173
195
 
196
+ // Handle coordinate-based actions (NEW - fallback when selectors fail)
197
+ if (decision.coordinateAction && !iterationHadFailure) {
198
+ coordinateAttempts++;
199
+
200
+ this.logger?.(`[Orchestrator] 🎯 Coordinate Action (attempt ${coordinateAttempts}/2): ${decision.coordinateAction.action} at (${decision.coordinateAction.xPercent}%, ${decision.coordinateAction.yPercent}%)`);
201
+
202
+ try {
203
+ // BEFORE screenshot for visual verification
204
+ const beforeScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
205
+ const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
206
+
207
+ // Generate Playwright commands from coordinate action
208
+ const coordCommands = await CoordinateConverter.generateCommands(decision.coordinateAction, page);
209
+
210
+ this.logger?.(`[Orchestrator] Generated commands:`);
211
+ coordCommands.forEach(cmd => this.logger?.(` ${cmd}`));
212
+
213
+ // Execute coordinate commands
214
+ const coordResult = await this.executeCommandsSequentially(
215
+ coordCommands,
216
+ page,
217
+ memory,
218
+ stepNumber,
219
+ iteration,
220
+ jobId
221
+ );
222
+
223
+ commandsExecuted.push(...coordResult.executed);
224
+
225
+ if (!coordResult.allSucceeded) {
226
+ this.logger?.(`[Orchestrator] ❌ Coordinate action failed (Playwright error)`);
227
+ consecutiveFailures++;
228
+ iterationHadFailure = true;
229
+
230
+ // Give up after 2 coordinate attempts
231
+ if (coordinateAttempts >= 2) {
232
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
233
+ return {
234
+ success: false,
235
+ commands: commandsExecuted,
236
+ iterations: iteration,
237
+ terminationReason: 'agent_stuck',
238
+ memory,
239
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
240
+ };
241
+ }
242
+ } else {
243
+ this.logger?.(`[Orchestrator] ✅ Coordinate action succeeded (no Playwright error)`);
244
+
245
+ // CRITICAL: Verify visually if goal was achieved (coordinates might have clicked wrong place)
246
+ // Wait for network idle (smarter than fixed timeout - waits as needed, max 10s)
247
+ try {
248
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
249
+ } catch (e) {
250
+ // Network idle timeout - page may still be loading, but proceed with verification
251
+ this.logger?.(`[Orchestrator] ⚠️ Network idle timeout after 10s, proceeding with verification`, 'warn');
252
+ }
253
+
254
+ const afterScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
255
+ const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
256
+
257
+ this.logger?.(`[Orchestrator] 📸 Verifying coordinate action visually...`);
258
+
259
+ const verificationRequest = {
260
+ model: 'gpt-5-mini',
261
+ systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
262
+ userPrompt: `Goal: ${scenarioSteps[stepNumber - 1]}\n\nA coordinate-based action was just executed. Compare the BEFORE and AFTER screenshots.\n\nDid the action achieve the goal? Respond with JSON:\n{\n "verified": boolean,\n "reasoning": "What changed (or didn't change) between screenshots",\n "visibleChanges": ["List of UI changes observed"],\n "clickedWrongPlace": boolean\n}\n\nBe strict: Only return verified=true if you clearly see the expected change matching the goal.`,
263
+ images: [
264
+ { label: 'BEFORE', dataUrl: beforeDataUrl },
265
+ { label: 'AFTER', dataUrl: afterDataUrl }
266
+ ]
267
+ };
268
+
269
+ const verificationResponse = await this.llmFacade.llmProvider.callLLM(verificationRequest);
270
+ const jsonMatch = verificationResponse.answer.match(/\{[\s\S]*\}/);
271
+
272
+ if (jsonMatch) {
273
+ const verificationResult = JSON.parse(jsonMatch[0]);
274
+ const verified = verificationResult.verified === true;
275
+ const reasoning = verificationResult.reasoning || 'No reasoning provided';
276
+ const clickedWrongPlace = verificationResult.clickedWrongPlace === true;
277
+
278
+ this.logger?.(`[Orchestrator] 📊 Visual verification: ${verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
279
+ this.logger?.(`[Orchestrator] 💭 Reasoning: ${reasoning}`);
280
+
281
+ if (verified) {
282
+ // Goal achieved! Can mark complete
283
+ consecutiveFailures = 0;
284
+
285
+ // Store note for context
286
+ noteToSelf = {
287
+ fromIteration: iteration,
288
+ content: `Coordinate action verified successful: ${reasoning}`
289
+ };
290
+ memory.latestNote = noteToSelf;
291
+ } else {
292
+ // Goal NOT achieved despite no error
293
+ this.logger?.(`[Orchestrator] ⚠️ Coordinate click succeeded but goal NOT achieved`, 'warn');
294
+ consecutiveFailures++;
295
+ iterationHadFailure = true;
296
+
297
+ // Store diagnostic note for next attempt
298
+ const diagnostic = clickedWrongPlace
299
+ ? `Clicked wrong place. ${reasoning}. Try different coordinates.`
300
+ : `Action executed but goal not achieved. ${reasoning}. May need different approach.`;
301
+
302
+ noteToSelf = {
303
+ fromIteration: iteration,
304
+ content: diagnostic
305
+ };
306
+ memory.latestNote = noteToSelf;
307
+
308
+ // Give up after 2 coordinate attempts
309
+ if (coordinateAttempts >= 2) {
310
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts, none achieved goal) - marking stuck`);
311
+ return {
312
+ success: false,
313
+ commands: commandsExecuted,
314
+ iterations: iteration,
315
+ terminationReason: 'agent_stuck',
316
+ memory,
317
+ error: `Coordinate actions clicked but didn't achieve goal: ${reasoning}`
318
+ };
319
+ }
320
+ }
321
+ } else {
322
+ this.logger?.(`[Orchestrator] ⚠️ Could not parse verification response - treating as unverified`, 'warn');
323
+ consecutiveFailures++;
324
+ iterationHadFailure = true;
325
+ }
326
+ }
327
+
328
+ } catch (error: any) {
329
+ this.logger?.(`[Orchestrator] ❌ Coordinate action error: ${error.message}`, 'error');
330
+ consecutiveFailures++;
331
+ iterationHadFailure = true;
332
+
333
+ // Give up after 2 coordinate attempts
334
+ if (coordinateAttempts >= 2) {
335
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
336
+ return {
337
+ success: false,
338
+ commands: commandsExecuted,
339
+ iterations: iteration,
340
+ terminationReason: 'agent_stuck',
341
+ memory,
342
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
343
+ };
344
+ }
345
+ }
346
+ }
347
+
174
348
  // System-enforced stuck detection (agent might not detect it)
175
- if (consecutiveFailures >= 3) {
349
+ // Allow 5 failures: 3 selector attempts + 2 coordinate attempts
350
+ if (consecutiveFailures >= 5) {
176
351
  this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
177
352
  return {
178
353
  success: false,
@@ -208,20 +383,38 @@ export class OrchestratorAgent {
208
383
  // Store self-reflection for next iteration
209
384
  previousReflection = decision.selfReflection;
210
385
 
386
+ // Store note to future self (NEW - tactical memory across iterations AND steps)
387
+ if (decision.noteToFutureSelf) {
388
+ noteToSelf = {
389
+ fromIteration: iteration,
390
+ content: decision.noteToFutureSelf
391
+ };
392
+ memory.latestNote = noteToSelf; // Persist in journey memory across steps
393
+ this.logger?.(`[Orchestrator] 📝 Note to self: ${decision.noteToFutureSelf}`);
394
+ }
395
+
211
396
  // Check termination
212
397
  if (decision.status !== 'continue') {
213
398
  this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
214
399
  this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);
215
400
 
216
- return {
217
- success: decision.status === 'complete',
218
- commands: commandsExecuted,
219
- iterations: iteration,
220
- terminationReason: decision.status === 'complete' ? 'complete' :
221
- decision.status === 'stuck' ? 'agent_stuck' :
222
- 'infeasible',
223
- memory
224
- };
401
+ // SAFETY CHECK: Don't allow "complete" if commands failed this iteration
402
+ if (decision.status === 'complete' && iterationHadFailure) {
403
+ this.logger?.(`[Orchestrator] ⚠️ OVERRIDE: Agent said "complete" but commands FAILED - forcing "continue"`, 'warn');
404
+ this.logger?.(`[Orchestrator] 💭 System: Commands must succeed before marking complete`);
405
+ // Don't return - continue to next iteration
406
+ } else {
407
+ // Valid termination
408
+ return {
409
+ success: decision.status === 'complete',
410
+ commands: commandsExecuted,
411
+ iterations: iteration,
412
+ terminationReason: decision.status === 'complete' ? 'complete' :
413
+ decision.status === 'stuck' ? 'agent_stuck' :
414
+ 'infeasible',
415
+ memory
416
+ };
417
+ }
225
418
  }
226
419
  }
227
420
 
@@ -249,7 +442,8 @@ export class OrchestratorAgent {
249
442
  scenarioSteps: string[],
250
443
  memory: JourneyMemory,
251
444
  previousReflection?: SelfReflection,
252
- consecutiveFailures?: number
445
+ consecutiveFailures?: number,
446
+ noteFromPreviousIteration?: NoteToFutureSelf // NEW
253
447
  ): Promise<AgentContext> {
254
448
  // Get fresh DOM
255
449
  const currentPageInfo = await getEnhancedPageInfo(page);
@@ -271,7 +465,8 @@ export class OrchestratorAgent {
271
465
  recentSteps,
272
466
  experiences: memory.experiences,
273
467
  extractedData: memory.extractedData,
274
- previousIterationGuidance: previousReflection
468
+ previousIterationGuidance: previousReflection,
469
+ noteFromPreviousIteration // NEW: Pass tactical note from previous iteration
275
470
  };
276
471
  }
277
472
 
@@ -285,9 +480,25 @@ export class OrchestratorAgent {
285
480
  iteration: number,
286
481
  consecutiveFailures?: number
287
482
  ): Promise<AgentDecision> {
288
- // Build prompt
289
- const systemPrompt = this.buildSystemPrompt();
290
- const userPrompt = this.buildUserPrompt(context, consecutiveFailures);
483
+ // Detect if coordinate mode should be activated
484
+ // Phase 1: Only 2 tiers (selectors → coordinates), so activate after 3 failures
485
+ // Phase 2: Will have 3 tiers (selectors → index → coordinates), threshold will be 5
486
+ const useCoordinateMode = consecutiveFailures !== undefined && consecutiveFailures >= 3;
487
+
488
+ // Build appropriate system prompt based on mode
489
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
490
+ const systemPrompt = useCoordinateMode
491
+ ? OrchestratorPrompts.buildCoordinateSystemPrompt()
492
+ : OrchestratorPrompts.buildSystemPrompt(toolDescriptions);
493
+ const userPrompt = OrchestratorPrompts.buildUserPrompt(context, consecutiveFailures);
494
+
495
+ // Log prompt lengths for monitoring
496
+ const systemLength = systemPrompt.length;
497
+ const userLength = userPrompt.length;
498
+ const totalLength = systemLength + userLength;
499
+ const estimatedTokens = Math.ceil(totalLength / 4); // Rough estimate: 4 chars per token
500
+
501
+ this.logger?.(`[Orchestrator] 📊 Prompt lengths: system=${systemLength} chars, user=${userLength} chars, total=${totalLength} chars (~${estimatedTokens} tokens)`, 'log');
291
502
 
292
503
  try {
293
504
  // Call LLM directly via provider
@@ -332,409 +543,7 @@ export class OrchestratorAgent {
332
543
  }
333
544
  }
334
545
 
335
- /**
336
- * Build system prompt with tool descriptions
337
- */
338
- private buildSystemPrompt(): string {
339
- const toolDescriptions = this.toolRegistry.generateToolDescriptions();
340
-
341
- return `You are an intelligent test automation agent that executes web scenarios using Playwright.
342
-
343
- ${toolDescriptions}
344
-
345
- YOUR RESPONSE FORMAT (JSON):
346
- {
347
- "toolCalls": [{"name": "tool_name", "params": {...}}], // Optional: Tools to use
348
- "toolReasoning": "Why I need these tools", // If using tools
349
- "needsToolResults": true/false, // Wait for tool results before commands?
350
- "commands": ["await page.click(...)", ...], // Playwright commands to execute
351
- "commandReasoning": "Why these commands", // If executing commands
352
- "selfReflection": { // Guidance for your next iteration
353
- "guidanceForNext": "What to focus on next",
354
- "detectingLoop": false, // Set true if repeating same approach
355
- "loopReasoning": "Why I think I'm looping" // If detecting loop
356
- },
357
- "experiences": ["App-specific learning"], // APP-SPECIFIC patterns (see rules below)
358
- "status": "continue|complete|stuck|infeasible", // Your decision
359
- "statusReasoning": "Why this status",
360
- "reasoning": "Overall reasoning for this iteration",
361
- "blockerDetected": { // OPTIONAL: If something blocks the current goal
362
- "description": "What's blocking", // E.g., "Cookie consent modal"
363
- "clearingCommands": ["await page.click(...)"] // Commands to clear blocker FIRST
364
- },
365
- "stepReEvaluation": { // OPTIONAL: If questioning step order (last resort)
366
- "detected": true, // Set true if you think step order is wrong
367
- "issue": "prior_incomplete|already_done|wrong_order", // What's the issue?
368
- "explanation": "Why step order seems wrong" // Your reasoning
369
- }
370
- }
371
-
372
- STATUS DECISION RULES (CRITICAL!):
373
-
374
- - "complete": ALL commands succeeded AND step goal achieved
375
- ❌ NEVER mark complete if ANY command failed!
376
-
377
- - "continue": Command failed OR need more actions
378
- Use when: Need to retry with different selector, wait for something, or continue execution
379
-
380
- - "stuck": Tried 3+ iterations, all failed
381
-
382
- - "infeasible": Goal truly impossible (element doesn't exist anywhere)
383
-
384
- IF COMMAND FAILED → status must be "continue" (to retry) or "stuck" (if tried many times)
385
- IF COMMAND SUCCEEDED → status can be "complete" (if goal done) or "continue" (if more needed)
386
-
387
- STEP RE-EVALUATION (LAST RESORT - After 2+ Failures):
388
-
389
- When you've failed multiple times on current step, QUESTION YOUR ASSUMPTIONS:
390
-
391
- 1. **LOOK BACK**: Did prior steps really complete?
392
- - Check COMPLETED steps list vs current page state
393
- - Example: Says "Login complete" but you see login form → PRIOR STEP FAILED
394
- - If prior step incomplete: Execute commands to complete it, explain in reasoning
395
-
396
- 2. **LOOK FORWARD**: Is current step already done or no longer relevant?
397
- - Check if current step goal already achieved (element exists, state correct)
398
- - Example: Step says "Click Submit" but form already submitted → STEP ALREADY DONE
399
- - If already done: Mark "complete" and explain why
400
-
401
- 3. **LOOK AHEAD**: Is next step more feasible?
402
- - Sometimes UI flows differently than expected
403
- - Example: Current step "Click Edit" fails, but next step "Fill Name" elements are visible
404
- - If can do future step: Note in reasoning, but PREFER sticking to order
405
-
406
- STICK TO ORIGINAL PLAN unless clear evidence suggests otherwise.
407
- Only re-evaluate after repeated failures on same step.
408
-
409
- BLOCKER DETECTION (IMPORTANT!):
410
-
411
- Use "blockerDetected" when the CURRENT STEP GOAL is not immediately feasible because something unexpected is in the way:
412
-
413
- Examples of BLOCKERS:
414
- - Cookie consent modal appears before login
415
- - "What's New" tour overlay prevents clicking navigation
416
- - Email verification prompt blocks form submission
417
- - Session timeout modal requires re-login before continuing
418
-
419
- When you detect a blocker:
420
- 1. Set "blockerDetected.description" to describe what's blocking
421
- 2. Provide "clearingCommands" to dismiss/handle the blocker
422
- 3. Set your main "commands" to execute AFTER blocker is cleared
423
- 4. Agent will execute: clearingCommands FIRST, then your commands
424
-
425
- Example:
426
- {
427
- "blockerDetected": {
428
- "description": "Cookie consent modal blocking page interaction",
429
- "clearingCommands": ["await page.getByRole('button', {name: 'Accept'}).click()"]
430
- },
431
- "commands": ["await page.getByRole('textbox', {name: 'Email'}).fill('user@example.com')"],
432
- ...
433
- }
434
-
435
- DON'T use blockerDetected for:
436
- - Normal step execution (just use commands)
437
- - Wrong selectors (that's not a blocker, just fix the selector)
438
- - Missing elements (that's infeasible, not blocked)
439
-
440
- EXPERIENCES - APP-SPECIFIC PATTERNS ONLY:
441
-
442
- Guidelines:
443
- - Concise - combine related info into single learning
444
- - Focus on app quirks, not obvious practices
445
- - Max 1-2 experiences per iteration
446
- - Capture SEMANTIC SELECTOR patterns when discovered
447
-
448
- ✅ GOOD - App-specific patterns:
449
- - "Login form fields accessible via getByLabel: 'Email' and 'Password'"
450
- - "Submit buttons consistently use role=button with text matching action"
451
- - "Form uses custom dropdowns (role=combobox) instead of native selects"
452
- - "Navigation items accessible via getByRole('link') with visible text"
453
- - "Site uses data-testid attributes consistently (pattern: action-element-name)"
454
- - "Input fields have clear placeholders - prefer getByPlaceholder over IDs"
455
-
456
- ❌ BAD - Too specific or obvious:
457
- - Listing individual field selectors (combine them)
458
- - "Button exists with role=button" (obvious)
459
- - Generic automation advice
460
- - Noting auto-generated IDs like #«r3»-form-item (these are unreliable)
461
-
462
- COMBINE related learnings instead of listing each element separately.
463
- PRIORITIZE learning about semantic selector availability over CSS selectors.
464
-
465
- CRITICAL RULES FOR DECISION MAKING:
466
-
467
- 1. **BE FOCUSED AND CONCISE** (CRITICAL!):
468
- - Read the CURRENT STEP GOAL carefully - ONLY do what's explicitly asked for in THAT step
469
- - DON'T add extra actions, verifications, or defensive checks not mentioned in the step goal
470
- - If step says "Send a message" → ONLY send the message, DON'T verify it appeared
471
- - If step says "Click Submit" → ONLY click, DON'T wait or verify submission success
472
- - ONLY add expect() assertions if the step explicitly mentions "verify", "check", "ensure", or "confirm"
473
- - Keep commands minimal - no unnecessary preparatory or validation actions
474
-
475
- 2. **COMMAND EFFICIENCY**:
476
- - DON'T click on input field before filling it - just use .fill() directly
477
- - DON'T do preparatory actions unless necessary
478
- - Example: page.fill works without clicking the input first
479
-
480
- 2b. **VARIABLE SCOPE & AVAILABLE FUNCTIONS** (CRITICAL!):
481
- - Commands in SAME BATCH share scope - variables persist within iteration
482
- - To save data for later steps: Use extract_data tool, then access via extractedData in context
483
- - Within same batch: Variables work across commands
484
- - AVAILABLE IN CONTEXT: page (Playwright Page object) and expect (Playwright assertions)
485
- - You can use expect() directly - it is already imported and available
486
-
487
- 3. **WHEN COMMAND FAILS** (most important!):
488
- - DON'T repeat the same command/selector
489
- - DO analyze the error message
490
- - DO look at DOM snapshot for SEMANTIC selectors (getByRole, getByLabel, getByText)
491
- - AVOID falling back to auto-generated IDs with unicode characters (e.g., #«r3»-form-item)
492
- - DO take screenshot if DOM doesn't help
493
- - "Element is not a <select>" → It's a custom dropdown, use .click() then select from menu
494
- - "Timeout waiting for locator" → Try semantic selectors from ARIA tree instead of CSS IDs
495
-
496
- 4. **CHECK YOUR RECENT STEPS FOR PATTERNS**:
497
- - If you see same selector failed 2+ times → YOU ARE LOOPING!
498
- - Set detectingLoop: true and explain why
499
- - Take screenshot to see what's actually on the page
500
- - Use different SEMANTIC selector from DOM/ARIA tree (getByRole, getByLabel, getByText, data-testid)
501
- - AVOID repeating with similar auto-generated ID patterns (e.g., #«r3»-form-item → #«r4»-form-item)
502
-
503
- 5. **USE DOM SNAPSHOT** - ONLY USE WHAT'S ACTUALLY THERE (CRITICAL!):
504
- - You get INTERACTIVE ELEMENTS list showing: position, tag, id, text, SUGGESTED SELECTORS
505
- - ONLY use selectors from this list - DO NOT invent selectors with text/names not in the list!
506
- - ❌ WRONG: List shows button "Submit" but you use getByRole('button', {name: 'SomeOtherText'})
507
- - ✅ RIGHT: Use EXACT selector from the provided list
508
- - ✅ RIGHT: If list shows "#submit-btn" → use that
509
- - ✅ RIGHT: If list shows "getByRole('button', {name: 'X'})" → use exactly that
510
- - If element you need is NOT in list → take_screenshot to find it
511
- - ARIA tree shows roles/names that exist - inventing roles/names not in tree will fail!
512
-
513
- 5b. **SELECTOR PREFERENCE** (CRITICAL - Playwright Best Practices):
514
- ALWAYS prefer user-friendly, semantic selectors in this order:
515
-
516
- 🏆 BEST (use these first):
517
- 1. page.getByRole('role', {name: 'text'}) → ✅ Accessible, semantic, resilient
518
- 2. page.getByLabel('label text') → ✅ Great for form inputs
519
- 3. page.getByPlaceholder('placeholder') → ✅ Good for inputs without labels
520
- 4. page.getByText('visible text') → ✅ Clear and readable
521
- 5. page.getByTestId('test-id') → ✅ Stable if available
522
-
523
- ⚠️ AVOID (last resort only):
524
- - CSS selectors with auto-generated IDs: #r3-form-item, #«r3»-form-item
525
- - CSS selectors with unicode: #«r4»-form-item
526
- - Complex CSS paths: div.ant-form > div:nth-child(2)
527
-
528
- Examples:
529
- ❌ BAD: await page.fill('#«r3»-form-item', 'alice@example.com')
530
- ✅ GOOD: await page.getByLabel('Email').fill('alice@example.com')
531
- ✅ GOOD: await page.getByRole('textbox', {name: 'Email'}).fill('alice@example.com')
532
- ✅ GOOD: await page.getByPlaceholder('Enter your email').fill('alice@example.com')
533
-
534
- ❌ BAD: await page.click('#submit-btn-23847')
535
- ✅ GOOD: await page.getByRole('button', {name: 'Submit'}).click()
536
- ✅ GOOD: await page.getByText('Submit').click()
537
-
538
- WHY: Semantic selectors are more maintainable, self-documenting, and resilient to UI changes.
539
- Auto-generated IDs (especially with unicode like «r3») break when component instances change.
540
-
541
- 5c. **ASSERTIONS AND VERIFICATION** (CRITICAL - Use expect):
542
- Use expect() for verification ONLY when the step explicitly asks for verification.
543
-
544
- 🏆 WHEN TO USE expect():
545
- - Step says "Verify message appears" → await expect(locator).toBeVisible()
546
- - Step says "Check field is empty" → await expect(locator).toBeEmpty()
547
- - Step says "Confirm URL changed" → await expect(page).toHaveURL('...')
548
- - Step explicitly mentions: "verify", "check", "ensure", "confirm", "validate"
549
-
550
- ❌ WHEN NOT TO USE expect():
551
- - Step says "Send a message" → DON'T add expect() for message visibility
552
- - Step says "Click Submit" → DON'T add expect() for button state or result
553
- - Step says "Fill Email field" → DON'T add expect() to verify it's filled
554
- - Step says "Navigate to Dashboard" → DON'T add expect() for URL (unless step asks)
555
-
556
- 🏆 expect() syntax (when needed):
557
- - await expect(locator).toBeVisible()
558
- - await expect(locator).toHaveText('expected text')
559
- - await expect(locator).toHaveValue('expected value')
560
- - await expect(locator).toBeEmpty()
561
- - await expect(page).toHaveURL('expected-url')
562
-
563
- ❌ BAD - Manual checks:
564
- const value = await page.getByPlaceholder('Message...').inputValue();
565
- if (value !== '') throw new Error(\`Expected empty but found: '\${value}'\`);
566
-
567
- ✅ GOOD - Use expect (but ONLY if step asks to verify):
568
- await expect(page.getByPlaceholder('Message...')).toBeEmpty();
569
-
570
- WHY: expect() has built-in retries and better error messages.
571
- BUT: Only use it when verification is explicitly requested in the step goal.
572
-
573
- 6. **TOOLS ARE READ-ONLY (Information Gathering)**:
574
- AVAILABLE TOOLS (for information only):
575
- - take_screenshot: Get visual context when DOM isn't enough or stuck 2+ times
576
- * DEFAULT: Captures FULL PAGE (elements below fold included)
577
- * Set isFullPage=false ONLY if you need just viewport
578
- * Use for: Planning, finding hidden elements, verifying layout
579
- - recall_history: Remember earlier steps in journey
580
- - inspect_page: Force fresh DOM snapshot (rarely needed, already in context)
581
- - check_page_ready: Verify page loaded (use sparingly!)
582
- - extract_data: Save data from page to memory for later steps
583
-
584
- ALL STATE CHANGES USE PLAYWRIGHT COMMANDS:
585
- - Navigation: await page.goto('url')
586
- - Go back: await page.goBack()
587
- - Refresh: await page.reload()
588
- - Fill inputs: await page.fill() or page.getByRole().fill()
589
- - Click buttons: await page.click() or page.getByRole().click()
590
- - Wait: await page.waitForURL(), page.waitForLoadState()
591
-
592
- Tools gather info → Commands change state
593
-
594
- 7. **ERROR-SPECIFIC RESPONSES**:
595
- - "Timeout waiting for locator" → Selector doesn't exist, try different one from DOM
596
- - "Element is not a <select>" → It's a custom dropdown (button with role=combobox), use .click()
597
- - "Multiple elements" → Add more specific selector from DOM (use #id)
598
-
599
- 8. **WHEN TO RUN COMMANDS vs TOOLS**:
600
- - Confident about selectors from DOM → Run commands directly
601
- - Unsure or failed 2+ times → Take screenshot first
602
- - First iteration of a step → Usually can run commands from DOM`;
603
- }
604
-
605
- /**
606
- * Build user prompt with context
607
- */
608
- private buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
609
- const parts: string[] = [];
610
-
611
- parts.push('=== YOUR CURRENT CONTEXT ===\n');
612
-
613
- // System warnings for accumulated failures
614
- if (consecutiveFailures && consecutiveFailures >= 2) {
615
- parts.push(`⚠️⚠️⚠️ SYSTEM WARNING ⚠️⚠️⚠️`);
616
- parts.push(`You've failed ${consecutiveFailures} iterations in a row!`);
617
- parts.push(`Next failure will force STUCK. You MUST:`);
618
- parts.push(`1. Take screenshot if you haven't already`);
619
- parts.push(`2. Try completely different selector strategy`);
620
- parts.push(`3. QUESTION ASSUMPTIONS: Am I at the right step?`);
621
- parts.push(` - LOOK BACK: Did prior steps really complete? Check page state vs COMPLETED list`);
622
- parts.push(` - LOOK FORWARD: Is current step already done? Check if goal already achieved`);
623
- parts.push(`4. OR mark status="stuck" if truly can't proceed`);
624
- parts.push(`⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️\n`);
625
- }
626
-
627
- // Goals - make current step very prominent
628
- parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
629
- parts.push(`${context.currentStepGoal}`);
630
- parts.push(`^^ DO ONLY WHAT THIS STEP ASKS - NO EXTRA ACTIONS OR VERIFICATIONS ^^`);
631
- parts.push(`^^ If step doesn't say "verify/check/confirm" → DON'T add expect() assertions ^^`);
632
- parts.push(`^^ Mark "complete" ONLY if commands succeeded ^^`);
633
- parts.push(``);
634
- parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
635
-
636
- if (context.completedSteps.length > 0) {
637
- parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
638
- }
639
- if (context.remainingSteps.length > 0) {
640
- parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
641
- }
642
-
643
- // Current page state
644
- parts.push(`\nCURRENT PAGE:`);
645
- parts.push(`URL: ${context.currentURL}`);
646
- parts.push(`Title: ${context.currentPageInfo.title}`);
647
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
648
- parts.push(context.currentPageInfo.formattedElements);
649
- parts.push(`\nARIA TREE (hierarchical structure):`);
650
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
651
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
652
- parts.push('... (truncated)');
653
- }
654
- parts.push('')
655
-
656
- // Recent steps
657
- if (context.recentSteps.length > 0) {
658
- parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
659
- for (const step of context.recentSteps) {
660
- const status = step.result === 'success' ? '✓' : '✗';
661
- parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
662
- parts.push(` Code: ${step.code}`);
663
- if (step.result === 'failure' && step.error) {
664
- parts.push(` ❌ ERROR: ${step.error}`);
665
- parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
666
- } else {
667
- parts.push(` Result: ${step.observation}`);
668
- }
669
- }
670
- parts.push('');
671
-
672
- // Detect repeated failures
673
- const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
674
- if (recentFailures.length >= 2) {
675
- const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
676
- i === 0 || s.code === arr[i-1].code
677
- );
678
- if (sameSelector) {
679
- parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
680
- parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
681
- parts.push(` YOU MUST try a completely different selector this time!\n`);
682
- }
683
- }
684
- }
685
-
686
- // Experiences
687
- if (context.experiences.length > 0) {
688
- parts.push(`\nLEARNINGS (${context.experiences.length}):`);
689
- for (const exp of context.experiences) {
690
- parts.push(` - ${exp}`);
691
- }
692
- parts.push('');
693
- }
694
-
695
- // Extracted data
696
- if (Object.keys(context.extractedData).length > 0) {
697
- parts.push(`\nEXTRACTED DATA:`);
698
- for (const [key, value] of Object.entries(context.extractedData)) {
699
- parts.push(` ${key}: ${value}`);
700
- }
701
- parts.push('');
702
- }
703
-
704
- // Previous reflection
705
- if (context.previousIterationGuidance) {
706
- parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
707
- parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
708
- if (context.previousIterationGuidance.detectingLoop) {
709
- parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
710
- }
711
- parts.push('');
712
- }
713
-
714
- // Tool results (if any)
715
- if (context.toolResults) {
716
- parts.push(`\n========== TOOL RESULTS ==========`);
717
- for (const [toolName, result] of Object.entries(context.toolResults)) {
718
- parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
719
-
720
- if (result.learning) {
721
- parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
722
- parts.push(result.learning);
723
- parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
724
- }
725
-
726
- if (result.error) {
727
- parts.push(` Error: ${result.error}`);
728
- }
729
- }
730
- parts.push(`\n===================================\n`);
731
- }
732
-
733
- parts.push('=== WHAT DO YOU WANT TO DO? ===');
734
- parts.push('Respond with JSON following the format specified in the system prompt.');
735
-
736
- return parts.join('\n');
737
- }
546
+
738
547
 
739
548
  /**
740
549
  * Parse agent decision from LLM response
@@ -744,21 +553,37 @@ CRITICAL RULES FOR DECISION MAKING:
744
553
  // Extract JSON from response
745
554
  const jsonMatch = response.match(/\{[\s\S]*\}/);
746
555
  if (!jsonMatch) {
556
+ this.logger?.(`[Orchestrator] ✗ No JSON found in LLM response`, 'error');
557
+ this.logger?.(`[Orchestrator] 📄 FULL LLM RESPONSE:\n${response}`, 'error');
747
558
  throw new Error('No JSON found in response');
748
559
  }
749
560
 
750
561
  const parsed = JSON.parse(jsonMatch[0]);
751
562
 
752
563
  // Validate required fields
753
- if (!parsed.status || !parsed.reasoning) {
754
- throw new Error('Missing required fields: status and reasoning');
564
+ // Accept either "reasoning" or "statusReasoning" (LLMs sometimes only provide one)
565
+ if (!parsed.status || (!parsed.reasoning && !parsed.statusReasoning)) {
566
+ this.logger?.(`[Orchestrator] ✗ Missing required fields in parsed JSON`, 'error');
567
+ this.logger?.(`[Orchestrator] 📄 FULL LLM RESPONSE:\n${response}`, 'error');
568
+ this.logger?.(`[Orchestrator] 📄 PARSED JSON:\n${JSON.stringify(parsed, null, 2)}`, 'error');
569
+ this.logger?.(`[Orchestrator] ❌ Has status: ${!!parsed.status}, Has reasoning: ${!!parsed.reasoning}, Has statusReasoning: ${!!parsed.statusReasoning}`, 'error');
570
+ throw new Error('Missing required fields: status and (reasoning or statusReasoning)');
571
+ }
572
+
573
+ // Normalize: if reasoning is missing but statusReasoning exists, use statusReasoning as reasoning
574
+ if (!parsed.reasoning && parsed.statusReasoning) {
575
+ parsed.reasoning = parsed.statusReasoning;
755
576
  }
756
577
 
757
578
  return parsed as AgentDecision;
758
579
 
759
580
  } catch (error: any) {
760
581
  this.logger?.(`[Orchestrator] ✗ Failed to parse agent decision: ${error.message}`, 'error');
761
- this.logger?.(`[Orchestrator] Raw response: ${response.substring(0, 200)}...`, 'log');
582
+
583
+ // Only log full response if not already logged above
584
+ if (!error.message.includes('Missing required fields') && !error.message.includes('No JSON found')) {
585
+ this.logger?.(`[Orchestrator] 📄 FULL LLM RESPONSE:\n${response}`, 'error');
586
+ }
762
587
 
763
588
  // Return fallback
764
589
  return {