npm - testchimp-runner-core - Versions diffs - 0.0.33 → 0.0.34 - Mend

testchimp-runner-core 0.0.33 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

package/dist/llm-facade.d.ts.map +1 -1
package/dist/llm-facade.js +7 -7
package/dist/llm-facade.js.map +1 -1
package/dist/llm-provider.d.ts +9 -0
package/dist/llm-provider.d.ts.map +1 -1
package/dist/model-constants.d.ts +16 -5
package/dist/model-constants.d.ts.map +1 -1
package/dist/model-constants.js +17 -6
package/dist/model-constants.js.map +1 -1
package/dist/orchestrator/index.d.ts +1 -1
package/dist/orchestrator/index.d.ts.map +1 -1
package/dist/orchestrator/index.js +3 -2
package/dist/orchestrator/index.js.map +1 -1
package/dist/orchestrator/orchestrator-agent.d.ts +0 -8
package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
package/dist/orchestrator/orchestrator-agent.js +206 -405
package/dist/orchestrator/orchestrator-agent.js.map +1 -1
package/dist/orchestrator/orchestrator-prompts.d.ts +20 -0
package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
package/dist/orchestrator/orchestrator-prompts.js +455 -0
package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
package/dist/orchestrator/tools/index.d.ts +2 -1
package/dist/orchestrator/tools/index.d.ts.map +1 -1
package/dist/orchestrator/tools/index.js +4 -2
package/dist/orchestrator/tools/index.js.map +1 -1
package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
package/dist/orchestrator/tools/verify-action-result.js +140 -0
package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
package/dist/orchestrator/types.d.ts +26 -0
package/dist/orchestrator/types.d.ts.map +1 -1
package/dist/orchestrator/types.js.map +1 -1
package/dist/prompts.d.ts.map +1 -1
package/dist/prompts.js +40 -34
package/dist/prompts.js.map +1 -1
package/dist/scenario-worker-class.d.ts.map +1 -1
package/dist/scenario-worker-class.js +4 -1
package/dist/scenario-worker-class.js.map +1 -1
package/dist/utils/coordinate-converter.d.ts +32 -0
package/dist/utils/coordinate-converter.d.ts.map +1 -0
package/dist/utils/coordinate-converter.js +130 -0
package/dist/utils/coordinate-converter.js.map +1 -0
package/package.json +1 -1
package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
package/plandocs/PHASE_1_COMPLETE.md +165 -0
package/plandocs/PHASE_1_SUMMARY.md +184 -0
package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
package/src/llm-facade.ts +8 -8
package/src/llm-provider.ts +11 -1
package/src/model-constants.ts +17 -5
package/src/orchestrator/index.ts +3 -2
package/src/orchestrator/orchestrator-agent.ts +249 -424
package/src/orchestrator/orchestrator-agent.ts.backup +1386 -0
package/src/orchestrator/orchestrator-prompts.ts +474 -0
package/src/orchestrator/tools/index.ts +2 -1
package/src/orchestrator/tools/verify-action-result.ts +159 -0
package/src/orchestrator/types.ts +48 -0
package/src/prompts.ts +40 -34
package/src/scenario-worker-class.ts +7 -2
package/src/utils/coordinate-converter.ts +162 -0
package/testchimp-runner-core-0.0.33.tgz +0 -0
/package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
/package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
/package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0

package/src/orchestrator/orchestrator-agent.ts.backup ADDED Viewed

@@ -0,0 +1,1386 @@
+/**
+ * Orchestrator Agent
+ * Single agent with tool-use capabilities that maintains journey memory and executes scenarios
+ */
+import { expect } from '@playwright/test';
+import { LLMFacade } from '../llm-facade';
+import { ProgressReporter, TokenUsage, StepExecutionStatus } from '../progress-reporter';
+import { getEnhancedPageInfo } from '../utils/page-info-utils';
+import { CoordinateConverter } from '../utils/coordinate-converter';
+import { ToolRegistry, ToolExecutionContext } from './tool-registry';
+import { DEFAULT_MODEL } from '../model-constants';
+import {
+  AgentConfig,
+  AgentContext,
+  AgentDecision,
+  JourneyMemory,
+  MemoryStep,
+  OrchestratorStepResult,
+  SelfReflection,
+  NoteToFutureSelf,
+  CoordinateAction,
+  DEFAULT_AGENT_CONFIG
+} from './types';
+import { OrchestratorPrompts } from './orchestrator-prompts';
+/**
+ * Orchestrator Agent - manages step execution with tool use and memory
+ */
+export class OrchestratorAgent {
+  private llmFacade: LLMFacade;
+  private toolRegistry: ToolRegistry;
+  private progressReporter?: ProgressReporter;
+  private config: Required<AgentConfig>;
+  private logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void;
+  private debugMode: boolean = false;
+  /**
+   * Create an orchestrator agent.
+   *
+   * @param llmFacade Facade whose LLM provider is used for agent and verification calls
+   * @param toolRegistry Registry that supplies tool descriptions for the system prompt
+   * @param config Optional overrides; any key not supplied falls back to DEFAULT_AGENT_CONFIG
+   * @param progressReporter Optional sink for progress and token-usage callbacks
+   * @param logger Optional logging callback taking a message and an optional level
+   * @param debugMode Optional initial debug flag; any falsy value is stored as false
+   */
+  constructor(
+    llmFacade: LLMFacade,
+    toolRegistry: ToolRegistry,
+    config?: Partial<AgentConfig>,
+    progressReporter?: ProgressReporter,
+    logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void,
+    debugMode?: boolean
+  ) {
+    // Caller-supplied settings win over the defaults, key by key.
+    const mergedConfig = { ...DEFAULT_AGENT_CONFIG, ...config };
+
+    this.config = mergedConfig;
+    this.toolRegistry = toolRegistry;
+    this.llmFacade = llmFacade;
+
+    // Optional collaborators are stored as given (possibly undefined).
+    this.logger = logger;
+    this.progressReporter = progressReporter;
+
+    this.debugMode = debugMode ? debugMode : false;
+  }
+  setDebugMode(enabled: boolean): void {  // Turn debug mode on or off after construction.
+    this.debugMode = enabled;  // overwrites whatever value the constructor stored
+  }
+  /**
+   * Execute a single step of the scenario.
+   *
+   * Runs a decide/act loop for at most `config.maxIterationsPerStep` iterations. Each iteration:
+   *   1. builds a fresh agent context and asks the agent for a decision (callAgent);
+   *   2. logs the decision and reports progress;
+   *   3. runs any requested tools and, when the decision sets `needsToolResults`, calls the agent
+   *      a second time with the tool results and merges part of that answer into the decision;
+   *   4. runs blocker-clearing commands, then the main commands, then the coordinate action.
+   *      A later stage is skipped when an earlier stage failed in the same iteration;
+   *   5. stores new experiences, the self-reflection and the note-to-self (the note is also
+   *      written to `memory.latestNote`);
+   *   6. returns when the decision status is anything other than "continue" - except that a
+   *      "complete" status is ignored when something failed during this iteration.
+   *
+   * Coordinate actions take a BEFORE screenshot, execute the commands produced by
+   * CoordinateConverter.generateCommands, wait for network idle (up to 10s), take an AFTER
+   * screenshot and ask the LLM to compare both. Only a JSON answer with `verified === true`
+   * counts as success.
+   *
+   * The method also returns a failure result on its own when:
+   *   - a coordinate action fails, is not verified, or throws, and this was at least the 2nd
+   *     coordinate attempt within this step;
+   *   - `consecutiveFailures` reaches 5;
+   *   - the iteration limit is reached.
+   *
+   * @param page Browser page handle (typed `any`); this method calls screenshot() and waitForLoadState() on it
+   * @param stepDescription Goal of the current step, used for logging, context building and progress reports
+   * @param stepNumber 1-based index of the step (`scenarioSteps[stepNumber - 1]` is read as the current goal)
+   * @param totalSteps Total number of steps, used for logging and context building
+   * @param scenarioSteps All step descriptions of the scenario
+   * @param memory Journey memory; `experiences` and `latestNote` are mutated in place
+   * @param jobId Job identifier passed on to the agent call, progress report and command execution
+   * @returns Result with the executed commands, the iteration count, the journey memory and a
+   *          terminationReason of 'complete', 'agent_stuck', 'infeasible' or 'system_limit';
+   *          the failure results produced by this method itself also carry an `error` message
+   */
+  async executeStep(
+    page: any,
+    stepDescription: string,
+    stepNumber: number,
+    totalSteps: number,
+    scenarioSteps: string[],
+    memory: JourneyMemory,
+    jobId: string
+  ): Promise<OrchestratorStepResult> {
+    this.logger?.(`\n[Orchestrator] ========== STEP ${stepNumber}/${totalSteps} ==========`);
+    this.logger?.(`[Orchestrator] 🎯 Goal: ${stepDescription}`);
+    let iteration = 0;
+    let previousReflection: SelfReflection | undefined = undefined;
+    let noteToSelf: NoteToFutureSelf | undefined = memory.latestNote;  // Start with note from previous step
+    const commandsExecuted: string[] = [];
+    let consecutiveFailures = 0;  // Track consecutive iterations with failed commands
+    let coordinateAttempts = 0;  // Track coordinate mode attempts (max 2); never reset within this step
+    while (iteration < this.config.maxIterationsPerStep) {
+      iteration++;
+      this.logger?.(`\n[Orchestrator] === Iteration ${iteration}/${this.config.maxIterationsPerStep} ===`);
+      // Build context for agent
+      const context = await this.buildAgentContext(
+        page,
+        stepDescription,
+        stepNumber,
+        totalSteps,
+        scenarioSteps,
+        memory,
+        previousReflection,
+        consecutiveFailures,
+        noteToSelf  // NEW: Pass note from previous iteration
+      );
+      // Call agent to make decision
+      const decision = await this.callAgent(
+        context,
+        jobId,
+        stepNumber,
+        iteration,
+        consecutiveFailures
+      );
+      // Log agent's reasoning
+      this.logAgentDecision(decision, iteration);
+      // Report progress
+      await this.reportStepProgress(jobId, stepNumber, stepDescription, decision, iteration);
+      // Execute tools if requested (tools are READ-ONLY, they don't change state)
+      let toolResults: Record<string, any> = {};
+      if (decision.toolCalls && decision.toolCalls.length > 0) {
+        toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
+        // If agent wants to wait for tool results before proceeding, call agent again
+        if (decision.needsToolResults) {
+          const updatedContext = { ...context, toolResults };
+          const continuedDecision = await this.callAgent(updatedContext, jobId, stepNumber, iteration, consecutiveFailures);
+          // Merge continued decision: only commands, commandReasoning, status, statusReasoning and reasoning are taken from it
+          decision.commands = continuedDecision.commands || decision.commands;
+          decision.commandReasoning = continuedDecision.commandReasoning || decision.commandReasoning;
+          decision.status = continuedDecision.status;
+          decision.statusReasoning = continuedDecision.statusReasoning;
+          decision.reasoning = continuedDecision.reasoning;
+        }
+      }
+      // Execute commands sequentially
+      let iterationHadFailure = false;
+      // Handle blocker if detected (clear blocker FIRST, then proceed with main commands)
+      if (decision.blockerDetected && decision.blockerDetected.clearingCommands && decision.blockerDetected.clearingCommands.length > 0) {
+        this.logger?.(`[Orchestrator] 🚧 BLOCKER DETECTED: ${decision.blockerDetected.description}`);
+        this.logger?.(`[Orchestrator] 🧹 Clearing blocker with ${decision.blockerDetected.clearingCommands.length} command(s)...`);
+        const blockerResult = await this.executeCommandsSequentially(
+          decision.blockerDetected.clearingCommands,
+          page,
+          memory,
+          stepNumber,
+          iteration,
+          jobId
+        );
+        // Add blocker commands with comment to output
+        if (blockerResult.executed.length > 0) {
+          commandsExecuted.push(`// Blocker: ${decision.blockerDetected.description}`);
+          commandsExecuted.push(...blockerResult.executed);
+        }
+        // If blocker clearing failed, track it; the flag also makes the main commands and coordinate action below get skipped this iteration
+        if (!blockerResult.allSucceeded) {
+          this.logger?.(`[Orchestrator] ❌ Failed to clear blocker - continuing anyway`);
+          consecutiveFailures++;
+          iterationHadFailure = true;
+        }
+      }
+      // Execute main commands (only if no blocker failure)
+      if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
+        const executeResult = await this.executeCommandsSequentially(
+          decision.commands,
+          page,
+          memory,
+          stepNumber,
+          iteration,
+          jobId
+        );
+        commandsExecuted.push(...executeResult.executed);
+        // Track failures
+        if (!executeResult.allSucceeded) {
+          this.logger?.(`[Orchestrator] ⚠ Command execution stopped at failure`);
+          consecutiveFailures++;
+          iterationHadFailure = true;
+        } else {
+          consecutiveFailures = 0;  // Reset on success
+        }
+      }
+      // Handle coordinate-based actions (NEW - fallback when selectors fail)
+      if (decision.coordinateAction && !iterationHadFailure) {
+        coordinateAttempts++;
+        this.logger?.(`[Orchestrator] 🎯 Coordinate Action (attempt ${coordinateAttempts}/2): ${decision.coordinateAction.action} at (${decision.coordinateAction.xPercent}%, ${decision.coordinateAction.yPercent}%)`);
+        try {
+          // BEFORE screenshot for visual verification
+          const beforeScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
+          const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
+          // Generate Playwright commands from coordinate action
+          const coordCommands = await CoordinateConverter.generateCommands(decision.coordinateAction, page);
+          this.logger?.(`[Orchestrator] Generated commands:`);
+          coordCommands.forEach(cmd => this.logger?.(`  ${cmd}`));
+          // Execute coordinate commands
+          const coordResult = await this.executeCommandsSequentially(
+            coordCommands,
+            page,
+            memory,
+            stepNumber,
+            iteration,
+            jobId
+          );
+          commandsExecuted.push(...coordResult.executed);
+          if (!coordResult.allSucceeded) {
+            this.logger?.(`[Orchestrator] ❌ Coordinate action failed (Playwright error)`);
+            consecutiveFailures++;
+            iterationHadFailure = true;
+            // Give up after 2 coordinate attempts
+            if (coordinateAttempts >= 2) {
+              this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
+              return {
+                success: false,
+                commands: commandsExecuted,
+                iterations: iteration,
+                terminationReason: 'agent_stuck',
+                memory,
+                error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
+              };
+            }
+          } else {
+            this.logger?.(`[Orchestrator] ✅ Coordinate action succeeded (no Playwright error)`);
+            // CRITICAL: Verify visually if goal was achieved (coordinates might have clicked wrong place)
+            // Wait for network idle (smarter than fixed timeout - waits as needed, max 10s)
+            try {
+              await page.waitForLoadState('networkidle', { timeout: 10000 });
+            } catch (e) {
+              // Network idle timeout - page may still be loading, but proceed with verification
+              this.logger?.(`[Orchestrator] ⚠️  Network idle timeout after 10s, proceeding with verification`, 'warn');
+            }
+            const afterScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
+            const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
+            this.logger?.(`[Orchestrator] 📸 Verifying coordinate action visually...`);
+            const verificationRequest = {
+              model: 'gpt-5-mini',  // NOTE(review): hard-coded model name, not DEFAULT_MODEL - confirm this is intentional
+              systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
+              userPrompt: `Goal: ${scenarioSteps[stepNumber - 1]}\n\nA coordinate-based action was just executed. Compare the BEFORE and AFTER screenshots.\n\nDid the action achieve the goal? Respond with JSON:\n{\n  "verified": boolean,\n  "reasoning": "What changed (or didn't change) between screenshots",\n  "visibleChanges": ["List of UI changes observed"],\n  "clickedWrongPlace": boolean\n}\n\nBe strict: Only return verified=true if you clearly see the expected change matching the goal.`,
+              images: [
+                { label: 'BEFORE', dataUrl: beforeDataUrl },
+                { label: 'AFTER', dataUrl: afterDataUrl }
+              ]
+            };
+            const verificationResponse = await this.llmFacade.llmProvider.callLLM(verificationRequest);  // NOTE(review): usage of this call is not forwarded to progressReporter in this method
+            const jsonMatch = verificationResponse.answer.match(/\{[\s\S]*\}/);  // greedy: spans from the first "{" to the last "}" in the answer
+            if (jsonMatch) {
+              const verificationResult = JSON.parse(jsonMatch[0]);  // a parse error lands in the catch block of the enclosing try
+              const verified = verificationResult.verified === true;
+              const reasoning = verificationResult.reasoning || 'No reasoning provided';
+              const clickedWrongPlace = verificationResult.clickedWrongPlace === true;
+              this.logger?.(`[Orchestrator] 📊 Visual verification: ${verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
+              this.logger?.(`[Orchestrator] 💭 Reasoning: ${reasoning}`);
+              if (verified) {
+                // Goal achieved! Can mark complete
+                consecutiveFailures = 0;
+                // Store note for context
+                noteToSelf = {
+                  fromIteration: iteration,
+                  content: `Coordinate action verified successful: ${reasoning}`
+                };
+                memory.latestNote = noteToSelf;
+              } else {
+                // Goal NOT achieved despite no error
+                this.logger?.(`[Orchestrator] ⚠️  Coordinate click succeeded but goal NOT achieved`, 'warn');
+                consecutiveFailures++;
+                iterationHadFailure = true;
+                // Store diagnostic note for next attempt
+                const diagnostic = clickedWrongPlace
+                  ? `Clicked wrong place. ${reasoning}. Try different coordinates.`
+                  : `Action executed but goal not achieved. ${reasoning}. May need different approach.`;
+                noteToSelf = {
+                  fromIteration: iteration,
+                  content: diagnostic
+                };
+                memory.latestNote = noteToSelf;
+                // Give up after 2 coordinate attempts
+                if (coordinateAttempts >= 2) {
+                  this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts, none achieved goal) - marking stuck`);
+                  return {
+                    success: false,
+                    commands: commandsExecuted,
+                    iterations: iteration,
+                    terminationReason: 'agent_stuck',
+                    memory,
+                    error: `Coordinate actions clicked but didn't achieve goal: ${reasoning}`
+                  };
+                }
+              }
+            } else {  // no JSON object found in the answer; unlike the other failure paths, this one has no 2-attempt give-up check
+              this.logger?.(`[Orchestrator] ⚠️  Could not parse verification response - treating as unverified`, 'warn');
+              consecutiveFailures++;
+              iterationHadFailure = true;
+            }
+          }
+        } catch (error: any) {
+          this.logger?.(`[Orchestrator] ❌ Coordinate action error: ${error.message}`, 'error');
+          consecutiveFailures++;
+          iterationHadFailure = true;
+          // Give up after 2 coordinate attempts
+          if (coordinateAttempts >= 2) {
+            this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
+            return {
+              success: false,
+              commands: commandsExecuted,
+              iterations: iteration,
+              terminationReason: 'agent_stuck',
+              memory,
+              error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
+            };
+          }
+        }
+      }
+      // System-enforced stuck detection (agent might not detect it)
+      // Allow 5 failures: 3 selector attempts + 2 coordinate attempts
+      if (consecutiveFailures >= 5) {
+        this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
+        return {
+          success: false,
+          commands: commandsExecuted,
+          iterations: iteration,
+          terminationReason: 'agent_stuck',
+          memory,
+          error: `Failed ${consecutiveFailures} iterations in a row - unable to proceed`
+        };
+      }
+      // Update memory with experiences
+      if (decision.experiences && decision.experiences.length > 0) {
+        for (const exp of decision.experiences) {
+          // Deduplicate - don't add if very similar experience exists (case-insensitive substring match in either direction)
+          const exists = memory.experiences.some(existing =>
+            existing.toLowerCase().includes(exp.toLowerCase()) ||
+            exp.toLowerCase().includes(existing.toLowerCase())
+          );
+          if (!exists) {
+            memory.experiences.push(exp);
+            this.logger?.(`[Orchestrator] 📚 Experience: ${exp}`);
+          }
+        }
+        // Cap experiences, keeping the most recently added ones
+        if (memory.experiences.length > this.config.maxExperiences) {
+          memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
+        }
+      }
+      // Store self-reflection for next iteration
+      previousReflection = decision.selfReflection;
+      // Store note to future self (NEW - tactical memory across iterations AND steps)
+      if (decision.noteToFutureSelf) {
+        noteToSelf = {
+          fromIteration: iteration,
+          content: decision.noteToFutureSelf
+        };
+        memory.latestNote = noteToSelf;  // Persist in journey memory across steps
+        this.logger?.(`[Orchestrator] 📝 Note to self: ${decision.noteToFutureSelf}`);
+      }
+      // Check termination
+      if (decision.status !== 'continue') {
+        this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
+        this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);
+        // SAFETY CHECK: Don't allow "complete" if commands failed this iteration
+        if (decision.status === 'complete' && iterationHadFailure) {
+          this.logger?.(`[Orchestrator] ⚠️  OVERRIDE: Agent said "complete" but commands FAILED - forcing "continue"`, 'warn');
+          this.logger?.(`[Orchestrator] 💭 System: Commands must succeed before marking complete`);
+          // Don't return - continue to next iteration
+        } else {
+          // Valid termination
+          return {
+            success: decision.status === 'complete',
+            commands: commandsExecuted,
+            iterations: iteration,
+            terminationReason: decision.status === 'complete' ? 'complete' :
+                             decision.status === 'stuck' ? 'agent_stuck' :
+                             'infeasible',
+            memory
+          };
+        }
+      }
+    }
+    // Hit iteration limit
+    this.logger?.(`[Orchestrator] ⚠ Maximum iterations reached (${this.config.maxIterationsPerStep})`);
+    return {
+      success: false,
+      commands: commandsExecuted,
+      iterations: iteration,
+      terminationReason: 'system_limit',
+      memory,
+      error: 'Maximum iterations reached'
+    };
+  }
+  /**
+   * Build the context object handed to the agent for one iteration.
+   *
+   * Fetches fresh page info and the current URL from `page`, takes the most recent
+   * `config.recentStepsCount` entries of `memory.history`, and splits `scenarioSteps` into
+   * completed steps (everything before the current one) and remaining steps (everything after it).
+   *
+   * @param page Browser page handle (typed `any`); passed to getEnhancedPageInfo and asked for url()
+   * @param currentStepGoal Description of the step being executed
+   * @param stepNumber 1-based index of the current step within `scenarioSteps`
+   * @param totalSteps Total number of steps in the scenario
+   * @param scenarioSteps All step descriptions; joined with newlines to form the overall goal
+   * @param memory Journey memory supplying history, experiences and extracted data
+   * @param previousReflection Self-reflection from the previous iteration, if any
+   * @param consecutiveFailures Accepted for signature compatibility; not used by this method
+   * @param noteFromPreviousIteration Tactical note carried over from the previous iteration, if any
+   * @returns The assembled AgentContext
+   */
+  private async buildAgentContext(
+    page: any,
+    currentStepGoal: string,
+    stepNumber: number,
+    totalSteps: number,
+    scenarioSteps: string[],
+    memory: JourneyMemory,
+    previousReflection?: SelfReflection,
+    consecutiveFailures?: number,
+    noteFromPreviousIteration?: NoteToFutureSelf  // NEW
+  ): Promise<AgentContext> {
+    // Get fresh DOM
+    const currentPageInfo = await getEnhancedPageInfo(page);
+    const currentURL = page.url();
+    // Get recent steps. A count of 0 must yield no steps: slice(-0) is slice(0) and would
+    // return the whole history, and a negative count would be read as a start offset.
+    const recentCount = this.config.recentStepsCount;
+    const recentSteps = recentCount > 0 ? memory.history.slice(-recentCount) : [];
+    // Build context
+    return {
+      overallGoal: scenarioSteps.join('\n'),
+      currentStepGoal,
+      stepNumber,
+      totalSteps,
+      completedSteps: scenarioSteps.slice(0, stepNumber - 1),
+      remainingSteps: scenarioSteps.slice(stepNumber),
+      currentPageInfo,
+      currentURL,
+      recentSteps,
+      experiences: memory.experiences,
+      extractedData: memory.extractedData,
+      previousIterationGuidance: previousReflection,
+      noteFromPreviousIteration  // NEW: Pass tactical note from previous iteration
+    };
+  }
+  /**
+   * Ask the LLM for the next decision on the current step.
+   *
+   * Uses the coordinate-mode system prompt once `consecutiveFailures` is 3 or more, otherwise the
+   * regular system prompt built from the tool descriptions. Logs the prompt sizes, sends the
+   * request with DEFAULT_MODEL, forwards token usage to the progress reporter when both the usage
+   * data and the callback exist, and parses the answer into an AgentDecision.
+   *
+   * Errors raised by the LLM call, the usage report or the parsing are logged and converted into
+   * a fallback decision with status "stuck".
+   *
+   * @param context Agent context used to build the user prompt
+   * @param jobId Job identifier attached to the token usage report
+   * @param stepNumber Step number attached to the token usage report
+   * @param iteration Iteration number attached to the token usage report
+   * @param consecutiveFailures Current failure streak; selects the prompt mode and is passed to the user prompt builder
+   * @returns The parsed decision, or the "stuck" fallback decision on error
+   */
+  private async callAgent(
+    context: AgentContext,
+    jobId: string,
+    stepNumber: number,
+    iteration: number,
+    consecutiveFailures?: number
+  ): Promise<AgentDecision> {
+    // Selector mode is the default; coordinate mode takes over from the third consecutive failure.
+    const selectorMode = consecutiveFailures === undefined || consecutiveFailures < 3;
+    const toolDescriptions = this.toolRegistry.generateToolDescriptions();
+    const systemPrompt = selectorMode
+      ? OrchestratorPrompts.buildSystemPrompt(toolDescriptions)
+      : OrchestratorPrompts.buildCoordinateSystemPrompt();
+    const userPrompt = OrchestratorPrompts.buildUserPrompt(context, consecutiveFailures);
+
+    // Prompt size monitoring (token count is a rough guess at 4 characters per token).
+    const chars = { system: systemPrompt.length, user: userPrompt.length };
+    const combinedChars = chars.system + chars.user;
+    const approxTokens = Math.ceil(combinedChars / 4);
+    this.logger?.(`[Orchestrator] 📊 Prompt lengths: system=${chars.system} chars, user=${chars.user} chars, total=${combinedChars} chars (~${approxTokens} tokens)`, 'log');
+
+    try {
+      // The request goes straight to the provider behind the facade.
+      const request = {
+        model: DEFAULT_MODEL,
+        systemPrompt,
+        userPrompt
+      };
+      const response = await this.llmFacade.llmProvider.callLLM(request);
+
+      if (!response.usage) {
+        this.logger?.(`[Orchestrator] ⚠ No usage data in LLM response`, 'warn');
+      } else if (this.progressReporter?.onTokensUsed) {
+        const usageReport: TokenUsage = {
+          jobId,
+          stepNumber,
+          iteration,
+          inputTokens: response.usage.inputTokens,
+          outputTokens: response.usage.outputTokens,
+          includesImage: false,
+          model: DEFAULT_MODEL,
+          timestamp: Date.now()
+        };
+        this.logger?.(`[Orchestrator] 💰 Reporting token usage: ${usageReport.inputTokens} + ${usageReport.outputTokens}`, 'log');
+        await this.progressReporter.onTokensUsed(usageReport);
+      }
+
+      return this.parseAgentDecision(response.answer);
+    } catch (error: any) {
+      this.logger?.(`[Orchestrator] ✗ Agent call failed: ${error.message}`, 'error');
+      // Hand back a decision the caller can act on instead of propagating the error.
+      const fallback: AgentDecision = {
+        status: 'stuck',
+        statusReasoning: `Agent call failed: ${error.message}`,
+        reasoning: 'LLM call failed'
+      };
+      return fallback;
+    }
+  }
+  /**
+   * Build system prompt with tool descriptions
+   */
+  private buildSystemPrompt(): string {
+    const toolDescriptions = this.toolRegistry.generateToolDescriptions();
+    return `You are an intelligent test automation agent that executes web scenarios using Playwright.
+${toolDescriptions}
+YOUR RESPONSE FORMAT - Output JSON matching this interface:
+interface AgentDecisionLLMResponse {
+  status: string;              // REQUIRED: "continue" | "complete" | "stuck" | "infeasible"
+  reasoning: string;           // REQUIRED: Your thinking - what you're doing and why
+  commands?: string[];         // Playwright commands to execute
+  commandReasoning?: string;   // Why these commands
+  toolCalls?: Array<{          // Tools to call
+    name: string;
+    params: Record<string, any>;
+  }>;
+  toolReasoning?: string;      // Why these tools
+  needsToolResults?: boolean;  // Wait for tool results before commands
+  noteToFutureSelf?: string;   // Free-form tactical note for next iteration
+  coordinateAction?: {         // Use when selectors fail (after 3 attempts)
+    type: "coordinate";
+    action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
+    xPercent: number;          // 0-100, 3 decimals (e.g., 15.755)
+    yPercent: number;          // 0-100, 3 decimals (e.g., 8.500)
+    toXPercent?: number;       // For drag
+    toYPercent?: number;       // For drag
+    value?: string;            // For fill
+    scrollAmount?: number;     // For scroll
+  };
+  selfReflection?: {
+    guidanceForNext: string;
+    detectingLoop: boolean;
+    loopReasoning?: string;
+  };
+  experiences?: string[];      // App-specific learnings
+  blockerDetected?: {
+    description: string;
+    clearingCommands: string[];
+  };
+  stepReEvaluation?: {
+    detected: boolean;
+    issue: "prior_incomplete" | "already_done" | "wrong_order" | null;
+    explanation: string;
+  };
+}
+STATUS DECISION RULES (CRITICAL - Think carefully!):
+RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
+- Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
+- EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
+- System will OVERRIDE and force "continue" if you violate this
+RULE #2: Decision tree:
+- Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
+- Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
+Status meanings:
+- "complete": Commands succeeded AND goal achieved
+- "continue": Command failed OR need more actions
+- "stuck": Tried 5 iterations, all failed, can't proceed
+- "infeasible": Goal impossible (element truly doesn't exist)
+Examples:
+✅ Command: page.click('button') → Success → Goal done → status: "complete"
+❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
+✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
+STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
+After repeated failures, check:
+1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
+2. LOOK FORWARD: Is current step already done?
+3. LOOK AHEAD: Is next step more feasible with current page state?
+Stick to original plan unless clear evidence suggests otherwise.
+BLOCKER DETECTION:
+Use when unexpected UI blocks current goal (modals, overlays, prompts).
+Provide clearingCommands to dismiss blocker, then regular commands execute.
+Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
+NOT blockers: Wrong selectors, missing elements (those are "infeasible")
+EXPERIENCES - App-specific patterns only:
+- Concise, 1-2 per iteration
+- Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
+- NOT obvious things ("button has role=button") or individual selectors
+- Combine related learnings
+CRITICAL RULES FOR DECISION MAKING:
+DECISION-MAKING PROCESS:
+1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
+2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
+3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
+   Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
+4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
+5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
+6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
+   - You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
+   - ONLY use what's in the list - DON'T invent text/names not shown
+   - Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
+   - ⚠️ If text appears multiple times → scope to parent or use role filter
+   - Element not in list? → take_screenshot
+   - ARIA tree = source of truth for roles/names
+   ⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
+   - Tooltips (role="tooltip") - informational only, NOT clickable
+   - Status messages (role="status", role="alert") - display info, NOT clickable
+   - Popovers with no buttons inside - usually just show info on hover
+   - Hidden elements (aria-hidden="true", display:none) - can't interact
+   If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
+   ⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
+   - Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
+   - BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
+   - GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
+   - GOOD: locator('button').getByText('Settings') → scoped to button element
+   - Always prefer role-based selectors when text might appear in tooltips
+7. **SELECTOR PREFERENCE** (CRITICAL):
+   Prefer in order:
+   1. getByRole/Label/Placeholder - Semantic, stable
+   2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
+   3. data-testid or stable IDs
+   4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
+   ⚠️ Common mistakes:
+   - getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
+     Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
+   - Missing timeout on goto() → Add { timeout: 30000 }
+   - Using auto-generated IDs → Break when components re-render
+8. **ASSERTIONS** (CRITICAL):
+   Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
+   When to use:
+   - "Verify X appears" → await expect(locator).toBeVisible()
+   - "Check field is empty" → await expect(locator).toBeEmpty()
+   - "Confirm URL" → await expect(page).toHaveURL('...')
+   When NOT to use:
+   - "Send message" → DON'T verify it appeared (unless step asks)
+   - "Click Submit" → DON'T check button state
+   - "Fill field" → DON'T verify it's filled
+9. **TOOLS vs COMMANDS**:
+   Tools = read-only info gathering (screenshot, recall_history, extract_data)
+   Commands = state changes (Playwright: goto, click, fill, etc.)
+   Navigation commands MUST include timeout:
+   - page.goto(url, { waitUntil: 'load', timeout: 30000 })
+   - page.waitForLoadState('load', { timeout: 30000 })
+10. **ERROR ANALYSIS** (Think about what went wrong):
+   - "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
+   - "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
+   - "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
+   - "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
+   - Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
+11. **WHEN TO RUN COMMANDS vs TOOLS**:
+   - Confident about selectors from DOM → Run commands directly
+   - Unsure or failed 2+ times → Take screenshot first
+   - First iteration of a step → Usually can run commands from DOM
+12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
+   Write FREE-FORM notes for your next iteration about:
+   - What you tried and why it failed
+   - Hypothesis being tested
+   - Plan for next attempt
+   - Page behavior patterns observed
+   Your next iteration reads this FIRST - use it to maintain strategic continuity.
+13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
+    Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
+    - xPercent: 0=left, 100=right
+    - yPercent: 0=top, 100=bottom
+    Format:
+    {
+      "coordinateAction": {
+        "type": "coordinate",
+        "action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
+        "xPercent": 15.755, "yPercent": 8.500,
+        "toXPercent": 45.25, "toYPercent": 8.50,  // For drag
+        "value": "text",      // For fill
+        "scrollAmount": 500   // For scroll
+      }
+    }
+    AFTER coordinate action succeeds:
+    - If goal verification unclear → CALL verify_action_result tool
+    - Tool compares before/after screenshots to confirm goal achieved
+    - If verified: mark status="complete"
+    - If not verified: try different coordinates (2 attempts max)
+    Example after coordinate click:
+    {
+      "status": "continue",
+      "reasoning": "Coordinate click succeeded, verifying if dashboard page loaded",
+      "toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page with data grid visible"}}],
+      "needsToolResults": true
+    }`;
+  }
+  /**
+   * Ask the vision model whether an action reached its goal by comparing a
+   * BEFORE and an AFTER screenshot.
+   *
+   * Costly: every call ships two images to the model, so call it sparingly.
+   *
+   * @param goalDescription - Goal text interpolated into the user prompt
+   * @param beforeDataUrl - Screenshot sent to the model labelled BEFORE
+   * @param afterDataUrl - Screenshot sent to the model labelled AFTER
+   * @returns true only when the parsed reply carries `goalAchieved === true`;
+   *          false when no JSON object is found in the reply or anything throws
+   */
+  private async verifyGoalWithScreenshotComparison(
+    goalDescription: string,
+    beforeDataUrl: string,
+    afterDataUrl: string
+  ): Promise<boolean> {
+    this.logger?.(`[Orchestrator] 🔍 Verifying goal with screenshot comparison...`);
+    try {
+      const comparisonPrompt = `Goal: ${goalDescription}
+Compare the BEFORE and AFTER screenshots.
+Did the action achieve the goal? Respond with JSON:
+{
+  "goalAchieved": boolean,
+  "reasoning": "What changed (or didn't change) between screenshots",
+  "visibleChanges": ["List of UI changes observed"]
+}
+Focus on:
+- Did expected elements appear/disappear?
+- Did page navigate or content change?
+- Are there visual indicators the action succeeded (new panels, forms, highlights)?
+Be strict: Only return true if you clearly see the expected change.`;
+      const screenshots = [
+        { label: 'BEFORE', dataUrl: beforeDataUrl },
+        { label: 'AFTER', dataUrl: afterDataUrl }
+      ];
+      const verificationRequest = {
+        model: 'gpt-5-mini', // Vision model
+        systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
+        userPrompt: comparisonPrompt,
+        images: screenshots
+      };
+      const llmReply = await this.llmFacade.llmProvider.callLLM(verificationRequest);
+      // Grab the outermost {...} span of the reply; the model may wrap it in prose
+      const jsonSpan = llmReply.answer.match(/\{[\s\S]*\}/);
+      if (jsonSpan === null) {
+        // Unparseable reply counts as "goal not achieved"
+        this.logger?.(`[Orchestrator] ⚠️ Visual verification: Could not parse response`, 'warn');
+        return false;
+      }
+      const verdict = JSON.parse(jsonSpan[0]);
+      this.logger?.(`[Orchestrator] 📊 Visual verification result: ${verdict.goalAchieved}`);
+      this.logger?.(`[Orchestrator] 💭 Reasoning: ${verdict.reasoning}`);
+      // Strict comparison: truthy non-boolean values do not count as success
+      return verdict.goalAchieved === true;
+    } catch (error: any) {
+      // Any failure (LLM call, JSON parse) also counts as "goal not achieved"
+      this.logger?.(`[Orchestrator] ❌ Visual verification failed: ${error.message}`, 'error');
+      return false;
+    }
+  }
+  /**
+   * Build the coordinate-specific system prompt (used when selectors repeatedly fail).
+   *
+   * The returned text is a fixed template with no interpolated values. It tells the
+   * model to stop generating selectors and to answer with a `coordinateAction` whose
+   * xPercent/yPercent are viewport percentages (0-100) measured from the top-left.
+   *
+   * @returns The static system prompt string
+   */
+  private buildCoordinateSystemPrompt(): string {
+    return `You are a visual web automation expert. Selector generation has FAILED multiple times.
+YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
+SCREENSHOT PROVIDED:
+You will see a screenshot with visual indicators (bounding boxes or markers).
+CRITICAL - IDENTIFY THE CORRECT ELEMENT:
+1. READ the step goal carefully - what specific element are you looking for?
+2. LOCATE that element in the screenshot (NOT a similar-looking element!)
+3. VERIFY position using screen regions:
+   - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
+   - Center content: xPercent ~30-70%
+   - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
+4. CALCULATE percentages from element's CENTER position
+5. SANITY CHECK your percentages:
+   - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
+   - Button in top-left at 90%? WRONG - that's top-right!
+   - Element description says "left" but x > 50%? WRONG - recheck!
+Example thought process:
+Goal: "Click Settings link in left navigation"
+→ I see "Settings" text in LEFT navigation panel in the screenshot
+→ Visual estimate: The link appears in the far left sidebar
+→ Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
+→ Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
+→ xPercent: 12.500, yPercent: 32.000
+→ Sanity check: 12.5% is FAR LEFT ✓ (NOT 80%+ which would be far right!)
+→ Description: "Clicking center of Settings link in left sidebar"
+CRITICAL VISUAL ESTIMATION TIPS:
+- Divide screenshot mentally into quadrants/regions
+- Left sidebar usually ~5-20% from left, center content ~30-70%, right sidebar ~75-95%
+- Aim for CENTER of element, not edges
+- Top bar usually 0-10% from top, footer usually 90-100%
+- Be conservative: slightly off-center is better than way off
+YOUR RESPONSE FORMAT - Output JSON matching this interface:
+interface AgentDecisionLLMResponse {
+  status: string;              // REQUIRED: "continue" (usually for coordinate mode)
+  reasoning: string;           // REQUIRED: "I see [element] at (X%, Y%) - using coordinates"
+  coordinateAction: {          // REQUIRED in coordinate mode
+    type: "coordinate";
+    action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
+    xPercent: number;          // 0-100, 3 decimals
+    yPercent: number;          // 0-100, 3 decimals
+    toXPercent?: number;       // For drag
+    toYPercent?: number;       // For drag
+    value?: string;            // For fill
+    scrollAmount?: number;     // For scroll
+  };
+  noteToFutureSelf?: string;   // Optional: What to try if this fails
+}
+COORDINATE REFERENCE:
+- Top-left corner: xPercent=0, yPercent=0
+- Top-right corner: xPercent=100, yPercent=0
+- Bottom-left corner: xPercent=0, yPercent=100
+- Bottom-right corner: xPercent=100, yPercent=100
+- Center of screen: xPercent=50, yPercent=50
+Use 3 decimal places for precision (e.g., 15.755, not 16).
+ACTIONS:
+**Physical clicks:**
+- click: { action: "click", xPercent: 15.755, yPercent: 8.500 }
+- doubleClick: { action: "doubleClick", xPercent: 15.755, yPercent: 8.500 }
+- rightClick: { action: "rightClick", xPercent: 15.755, yPercent: 8.500 }
+- hover: { action: "hover", xPercent: 15.755, yPercent: 8.500 }
+**Input actions:**
+- fill: Click then type
+  { action: "fill", xPercent: 30.000, yPercent: 25.000, value: "alice@example.com" }
+**Movement actions:**
+- drag: From one position to another
+  { action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000 }
+- scroll: At position, scroll by amount
+  { action: "scroll", xPercent: 50.000, yPercent: 50.000, scrollAmount: 500 }
+CRITICAL RULES:
+- Percentages are from viewport TOP-LEFT (not full page)
+- Use element CENTER for coordinates, not edges
+- Be precise with decimals - wrong coords click wrong element
+- For fill: system will click at (x%,y%) then type value automatically
+- For drag: toXPercent/toYPercent are REQUIRED
+DO NOT try to generate selectors - that approach already failed. Use coordinates only.
+This is a last-resort mechanism, but it WILL work if you provide accurate percentages.`;
+  }
+  /**
+   * Assemble the per-iteration user prompt from the agent context.
+   *
+   * Static rules come first (kept stable for LLM prompt caching); the parts that
+   * change every iteration (notes, failure warnings, goals, page state, history,
+   * learnings, extracted data, previous guidance, tool results) follow.
+   *
+   * @param context - Current agent context (goals, page info, history, tool results)
+   * @param consecutiveFailures - Failure count so far; exactly 2 adds a warning,
+   *        3 or more activates the coordinate-mode banner, 4 or more adds the
+   *        critical "next failure forces STUCK" notice
+   * @returns The prompt sections joined with newlines
+   */
+  private buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
+    const parts: string[] = [];
+    const failures = consecutiveFailures ?? 0;
+    // Put static instructions first for LLM caching efficiency
+    parts.push('STEP EXECUTION RULES:');
+    parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
+    parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
+    parts.push('- Mark "complete" ONLY if commands succeeded');
+    parts.push('- Try screenshot tool if you need visual context');
+    parts.push('- Max 5 iterations per step, then forced STUCK\n');
+    // Dynamic content follows (changes per iteration)
+    parts.push('=== CURRENT CONTEXT ===\n');
+    // Display note from previous iteration (high priority tactical info)
+    if (context.noteFromPreviousIteration) {
+      const note = context.noteFromPreviousIteration;
+      parts.push(`📝 YOUR NOTE FROM ITERATION ${note.fromIteration}:`);
+      parts.push(`   ${note.content}`);
+      parts.push(`   ^^ READ THIS - your previous self left important tactical guidance ^^`);
+      parts.push('');
+    }
+    // System warnings for accumulated failures
+    if (failures >= 2 && failures < 3) {
+      parts.push(`⚠️ SYSTEM WARNING: ${failures} failures!`);
+      parts.push(`Take screenshot if needed. Try different selector strategy.`);
+      parts.push(`Question assumptions: Am I at the right step?`);
+      parts.push(`⚠️\n`);
+    } else if (failures >= 4) {
+      parts.push(`⚠️ CRITICAL: ${failures} failures!`);
+      parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
+    }
+    // Trigger coordinate mode if many failures (Phase 1: after 3 failures)
+    if (failures >= 3) {
+      parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯`);
+      parts.push(`Selector generation has failed ${failures} times.`);
+      parts.push(`You MUST use coordinate-based action now (percentages).`);
+      parts.push(`Provide coordinateAction with xPercent/yPercent (0-100, 3 decimals for precision).`);
+      parts.push(`See system prompt for coordinate action format.`);
+      parts.push(`🎯🎯\n`);
+    }
+    // Goals - make current step very prominent
+    parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
+    parts.push(`${context.currentStepGoal}`);
+    parts.push(``);
+    parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
+    if (context.completedSteps.length > 0) {
+      parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
+    }
+    if (context.remainingSteps.length > 0) {
+      parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
+    }
+    // Current page state (most variable content - at the end)
+    parts.push(`\nCURRENT PAGE:`);
+    parts.push(`URL: ${context.currentURL}`);
+    parts.push(`Title: ${context.currentPageInfo.title}`);
+    parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
+    parts.push(context.currentPageInfo.formattedElements);
+    parts.push(`\nARIA TREE (hierarchical structure):`);
+    // Serialize once and decide truncation on the SAME pretty-printed text that is
+    // emitted; measuring the compact form could cut the tree without the marker.
+    // JSON.stringify returns undefined for an undefined snapshot, hence the fallback.
+    const ariaCharLimit = 5000;
+    const ariaJson = JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2) ?? 'null';
+    parts.push(ariaJson.substring(0, ariaCharLimit));
+    if (ariaJson.length > ariaCharLimit) {
+      parts.push('... (truncated)');
+    }
+    parts.push('');
+    // Recent steps (most variable content - at the end)
+    if (context.recentSteps.length > 0) {
+      parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
+      for (const step of context.recentSteps) {
+        const status = step.result === 'success' ? '✓' : '✗';
+        parts.push(`  ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
+        parts.push(`     Code: ${step.code}`);
+        if (step.result === 'failure' && step.error) {
+          parts.push(`     ❌ ERROR: ${step.error}`);
+          parts.push(`     ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
+        } else {
+          parts.push(`     Result: ${step.observation}`);
+        }
+      }
+      parts.push('');
+      // Detect repeated failures: the two most recent failed entries ran identical code
+      const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
+      if (recentFailures.length >= 2) {
+        const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
+          i === 0 || s.code === arr[i-1].code
+        );
+        if (sameSelector) {
+          parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
+          parts.push(`   Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
+          parts.push(`   YOU MUST try a completely different selector this time!\n`);
+        }
+      }
+    }
+    // Experiences
+    if (context.experiences.length > 0) {
+      parts.push(`\nLEARNINGS (${context.experiences.length}):`);
+      for (const exp of context.experiences) {
+        parts.push(`  - ${exp}`);
+      }
+      parts.push('');
+    }
+    // Extracted data
+    if (Object.keys(context.extractedData).length > 0) {
+      parts.push(`\nEXTRACTED DATA:`);
+      for (const [key, value] of Object.entries(context.extractedData)) {
+        parts.push(`  ${key}: ${value}`);
+      }
+      parts.push('');
+    }
+    // Previous reflection
+    if (context.previousIterationGuidance) {
+      parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
+      parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
+      if (context.previousIterationGuidance.detectingLoop) {
+        parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
+      }
+      parts.push('');
+    }
+    // Tool results (if any)
+    if (context.toolResults) {
+      parts.push(`\n========== TOOL RESULTS ==========`);
+      for (const [toolName, result] of Object.entries(context.toolResults)) {
+        parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
+        if (result.learning) {
+          parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
+          parts.push(result.learning);
+          parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
+        }
+        if (result.error) {
+          parts.push(`  Error: ${result.error}`);
+        }
+      }
+      parts.push(`\n===================================\n`);
+    }
+    parts.push('=== WHAT DO YOU WANT TO DO? ===');
+    parts.push('Respond with JSON following the format specified in the system prompt.');
+    return parts.join('\n');
+  }
+  /**
+   * Turn the raw LLM reply into an AgentDecision.
+   *
+   * The outermost `{...}` span of the reply is parsed as JSON. A decision needs a
+   * `status` plus either `reasoning` or `statusReasoning`; when only
+   * `statusReasoning` is present it is copied into `reasoning`.
+   *
+   * @param response - Raw text returned by the LLM
+   * @returns The parsed decision, or a `status: 'stuck'` fallback describing the
+   *          parse error when extraction, parsing or validation fails
+   */
+  private parseAgentDecision(response: string): AgentDecision {
+    const logError = (message: string): void => {
+      this.logger?.(message, 'error');
+    };
+    const dumpFullResponse = (): void => {
+      logError(`[Orchestrator] 📄 FULL LLM RESPONSE:\n${response}`);
+    };
+    try {
+      // Locate the JSON payload inside whatever prose surrounds it
+      const jsonSpan = response.match(/\{[\s\S]*\}/);
+      if (jsonSpan === null) {
+        logError(`[Orchestrator] ✗ No JSON found in LLM response`);
+        dumpFullResponse();
+        throw new Error('No JSON found in response');
+      }
+      const decision = JSON.parse(jsonSpan[0]);
+      // LLMs sometimes provide only one of the two reasoning fields; either is enough
+      const hasAnyReasoning = Boolean(decision.reasoning) || Boolean(decision.statusReasoning);
+      if (!decision.status || !hasAnyReasoning) {
+        logError(`[Orchestrator] ✗ Missing required fields in parsed JSON`);
+        dumpFullResponse();
+        logError(`[Orchestrator] 📄 PARSED JSON:\n${JSON.stringify(decision, null, 2)}`);
+        logError(`[Orchestrator] ❌ Has status: ${!!decision.status}, Has reasoning: ${!!decision.reasoning}, Has statusReasoning: ${!!decision.statusReasoning}`);
+        throw new Error('Missing required fields: status and (reasoning or statusReasoning)');
+      }
+      // Backfill reasoning from statusReasoning so downstream code can rely on it
+      if (!decision.reasoning && decision.statusReasoning) {
+        decision.reasoning = decision.statusReasoning;
+      }
+      return decision as AgentDecision;
+    } catch (error: any) {
+      logError(`[Orchestrator] ✗ Failed to parse agent decision: ${error.message}`);
+      // The two errors thrown above already dumped the response; avoid logging it twice
+      const alreadyDumped =
+        error.message.includes('Missing required fields') || error.message.includes('No JSON found');
+      if (!alreadyDumped) {
+        dumpFullResponse();
+      }
+      // Fallback decision that stops the step instead of propagating the error
+      return {
+        status: 'stuck',
+        statusReasoning: 'Failed to parse agent response',
+        reasoning: `Parse error: ${error.message}`
+      };
+    }
+  }
+  /**
+   * Run the tool calls requested by the agent, one after another.
+   *
+   * At most `config.maxToolCallsPerIteration` calls are executed; the rest are
+   * dropped. Results are keyed by tool name, so when the same tool is called
+   * more than once only its last result is kept.
+   *
+   * @param toolCalls - Tool call descriptors (`name`, optional `params`)
+   * @param page - Page object handed to the tools via the execution context
+   * @param memory - Journey memory handed to the tools via the execution context
+   * @param stepNumber - Step number handed to the tools via the execution context
+   * @returns Map from tool name to the result returned by the tool registry
+   */
+  private async executeTools(
+    toolCalls: any[],
+    page: any,
+    memory: JourneyMemory,
+    stepNumber: number
+  ): Promise<Record<string, any>> {
+    this.logger?.(`[Orchestrator] 🔧 Executing ${toolCalls.length} tool(s)`);
+    const results: Record<string, any> = {};
+    const toolContext: ToolExecutionContext = {
+      page,
+      memory,
+      stepNumber,
+      logger: this.logger
+    };
+    for (const toolCall of toolCalls.slice(0, this.config.maxToolCallsPerIteration)) {
+      // JSON.stringify(undefined) returns undefined, so a call without params used to
+      // throw here before the tool ever ran; fall back to an empty object for the log.
+      const paramsPreview = JSON.stringify(toolCall.params ?? {}).substring(0, 50);
+      this.logger?.(`[Orchestrator]   ▶ ${toolCall.name}(${paramsPreview}...)`);
+      const result = await this.toolRegistry.execute(toolCall, toolContext);
+      results[toolCall.name] = result;
+      if (result.success) {
+        this.logger?.(`[Orchestrator]   ✓ ${toolCall.name} succeeded`);
+      } else {
+        this.logger?.(`[Orchestrator]   ✗ ${toolCall.name} failed: ${result.error}`, 'error');
+      }
+    }
+    return results;
+  }
+  /**
+   * Execute commands sequentially inside ONE generated async function.
+   *
+   * Every command is wrapped in its own `try` block, so `const`/`let` declared by
+   * one command are block-scoped and NOT visible to later commands; only `page`
+   * and `expect` are shared. Execution stops at the first failing command.
+   *
+   * Each command that ran successfully is appended to `memory.history` and to the
+   * returned `executed` list, also when a later command fails; the failing
+   * command is appended to history as a failure. History is capped at
+   * `config.maxHistorySize` on both paths.
+   *
+   * SECURITY NOTE(review): the command strings are compiled with `new Function`
+   * and run with full process privileges; they must come from a trusted producer.
+   *
+   * @param commands - Playwright command source strings; only the first
+   *        `config.maxCommandsPerIteration` are run
+   * @param page - Page object exposed to the commands as `page`
+   * @param memory - Journey memory whose history is appended to
+   * @param stepNumber - Step number recorded with every history entry
+   * @param iteration - Iteration number recorded with every history entry
+   * @param jobId - Not used by this method
+   * @returns The commands that succeeded and whether all of them did
+   */
+  private async executeCommandsSequentially(
+    commands: string[],
+    page: any,
+    memory: JourneyMemory,
+    stepNumber: number,
+    iteration: number,
+    jobId: string
+  ): Promise<{ executed: string[]; allSucceeded: boolean }> {
+    this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s) in shared context`);
+    const executed: string[] = [];
+    const limitedCommands = commands.slice(0, this.config.maxCommandsPerIteration);
+    const total = limitedCommands.length;
+    // Keep history bounded no matter which path returns
+    const capHistory = (): void => {
+      if (memory.history.length > this.config.maxHistorySize) {
+        memory.history = memory.history.slice(-this.config.maxHistorySize);
+      }
+    };
+    // On failure the generated code RETURNS the outcomes collected so far instead of
+    // rethrowing; rethrowing discarded them, which made every failure look like
+    // command 1 and lost the record of the commands that had already succeeded.
+    const commandsWithTracking = limitedCommands.map((cmd, i) => {
+      return `
+// Command ${i + 1}/${total}
+try {
+  ${cmd}
+  __results.push({ index: ${i}, success: true });
+} catch (error) {
+  __results.push({ index: ${i}, success: false, error: (error && error.message) || String(error) });
+  return __results; // Stop on first failure
+}`;
+    }).join('\n');
+    const wrappedCode = `
+const __results = [];
+${commandsWithTracking}
+return __results;
+`;
+    let outcomes: Array<{ index: number; success: boolean; error?: string }> = [];
+    // Set only when the generated script as a whole could not be compiled or run
+    let scriptError: string | undefined;
+    try {
+      // Pass both page and expect to make Playwright assertions available
+      const func = new Function('page', 'expect', 'return (async () => { ' + wrappedCode + ' })()');
+      const returned = await func(page, expect);
+      // A command containing its own `return` can make the script yield something else
+      if (Array.isArray(returned)) {
+        outcomes = returned;
+      }
+    } catch (error: any) {
+      scriptError = error?.message || String(error);
+    }
+    // Record every command that completed, in execution order
+    for (const outcome of outcomes) {
+      if (!outcome || !outcome.success) continue;
+      const cmd = limitedCommands[outcome.index];
+      this.logger?.(`[Orchestrator]   ✓ [${outcome.index + 1}/${total}] Success`);
+      memory.history.push({
+        stepNumber,
+        iteration,
+        action: `Command ${outcome.index + 1}/${total}`,
+        code: cmd,
+        result: 'success',
+        observation: 'Executed successfully',
+        url: page.url(),
+        timestamp: Date.now()
+      });
+      executed.push(cmd);
+    }
+    const failedOutcome = outcomes.find(o => o && !o.success);
+    if (!failedOutcome && scriptError === undefined) {
+      capHistory();
+      return { executed, allSucceeded: true };
+    }
+    // Failure path. When the whole script failed (e.g. a syntax error) no command
+    // ran, so the failure is attributed to the first command that did not execute.
+    const failedIndex = failedOutcome ? failedOutcome.index : executed.length;
+    const errorMessage = failedOutcome?.error ?? scriptError ?? 'Unknown error';
+    // Capture page state for debug logging
+    let pageStateDebug = '';
+    if (this.debugMode) {
+      try {
+        const pageInfo = await getEnhancedPageInfo(page);
+        pageStateDebug = `
+=== DEBUG: PAGE STATE AT FAILURE ===
+URL: ${page.url()}
+Title: ${pageInfo.title}
+INTERACTIVE ELEMENTS:
+${pageInfo.formattedElements}
+ARIA SNAPSHOT:
+${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}
+====================================`;
+      } catch (debugError: any) {
+        pageStateDebug = `Failed to capture page state: ${debugError.message}`;
+      }
+    }
+    if (failedIndex < total) {
+      this.logger?.(`[Orchestrator]   ✗ [${failedIndex + 1}/${total}] Failed: ${errorMessage}`, 'error');
+      // Log detailed debug info
+      if (this.debugMode && pageStateDebug) {
+        this.logger?.(pageStateDebug, 'debug');
+      }
+      memory.history.push({
+        stepNumber,
+        iteration,
+        action: `Command ${failedIndex + 1}/${total} - FAILED`,
+        code: limitedCommands[failedIndex],
+        result: 'failure',
+        observation: `Failed with error: ${errorMessage}. This selector likely doesn't exist or is incorrect.`,
+        error: errorMessage,
+        url: page.url(),
+        timestamp: Date.now()
+      });
+      if (failedIndex < total - 1) {
+        this.logger?.(`[Orchestrator]   ⚠ Skipping remaining ${total - failedIndex - 1} command(s)`, 'warn');
+      }
+    }
+    capHistory();
+    return { executed, allSucceeded: false };
+  }
+  /**
+   * Execute a single command exactly once.
+   *
+   * The command source is compiled with `new Function` and receives only `page`
+   * and `expect`, the same way executeCommandsSequentially runs its commands.
+   * The earlier eval-then-retry approach re-ran the whole command whenever the
+   * first attempt threw, so a command that failed after a partial side effect
+   * (a click, a fill) was performed twice; direct `eval` also exposed this
+   * method's local scope to the generated code.
+   *
+   * SECURITY NOTE(review): `cmd` runs with full process privileges; it must come
+   * from a trusted producer.
+   *
+   * @param cmd - Command source; may use `await`, `page` and `expect`
+   * @param page - Page object exposed to the command as `page`
+   * @throws Whatever the command throws, or a SyntaxError when `cmd` does not compile
+   */
+  private async executeCommand(cmd: string, page: any): Promise<void> {
+    // Pass both page and expect to make Playwright assertions available
+    const func = new Function('page', 'expect', `return (async () => { ${cmd} })()`);
+    await func(page, expect);
+  }
+  /**
+   * Write a human-readable summary of an agent decision to the logger.
+   *
+   * Logs the reasoning, then (when present) self-reflection, requested tools,
+   * detected blocker, step re-evaluation and a preview of up to three commands,
+   * each cut to 80 characters.
+   *
+   * @param decision - Decision to summarize
+   * @param iteration - Not used by this method
+   */
+  private logAgentDecision(decision: AgentDecision, iteration: number): void {
+    const { selfReflection, toolCalls, blockerDetected, stepReEvaluation, commands } = decision;
+    this.logger?.(`[Orchestrator] 💭 REASONING: ${decision.reasoning}`);
+
+    // Self-reflection, including a loop warning when the agent flagged one
+    if (selfReflection) {
+      this.logger?.(`[Orchestrator] 🧠 SELF-REFLECTION:`);
+      this.logger?.(`[Orchestrator]    Next: ${selfReflection.guidanceForNext}`);
+      if (selfReflection.detectingLoop) {
+        this.logger?.(`[Orchestrator]    🔄 LOOP DETECTED: ${selfReflection.loopReasoning}`, 'warn');
+      }
+    }
+
+    // Requested tools and the stated reason for them
+    if (toolCalls && toolCalls.length > 0) {
+      const toolNames = toolCalls.map(t => t.name).join(', ');
+      this.logger?.(`[Orchestrator] 🔧 TOOLS: ${toolNames}`);
+      if (decision.toolReasoning) {
+        this.logger?.(`[Orchestrator] 📋 Why: ${decision.toolReasoning}`);
+      }
+    }
+
+    if (blockerDetected) {
+      this.logger?.(`[Orchestrator] 🚧 BLOCKER: ${blockerDetected.description}`, 'warn');
+      this.logger?.(`[Orchestrator] 🧹 Clearing with ${blockerDetected.clearingCommands.length} command(s)`);
+    }
+
+    if (stepReEvaluation?.detected) {
+      this.logger?.(`[Orchestrator] 🔍 STEP RE-EVALUATION: ${stepReEvaluation.issue}`, 'warn');
+      this.logger?.(`[Orchestrator] 📝 Explanation: ${stepReEvaluation.explanation}`);
+    }
+
+    // Command preview: first three commands, then a count of the remainder
+    if (commands && commands.length > 0) {
+      this.logger?.(`[Orchestrator] 📝 COMMANDS (${commands.length}):`);
+      const preview = commands.slice(0, 3);
+      for (let position = 0; position < preview.length; position++) {
+        this.logger?.(`[Orchestrator]    ${position + 1}. ${preview[position].substring(0, 80)}...`);
+      }
+      const hiddenCount = commands.length - 3;
+      if (hiddenCount > 0) {
+        this.logger?.(`[Orchestrator]    ... and ${hiddenCount} more`);
+      }
+      if (decision.commandReasoning) {
+        this.logger?.(`[Orchestrator] 💡 Why: ${decision.commandReasoning}`);
+      }
+    }
+    // Experiences are logged when they are added to memory, not here
+  }
+  /**
+   * Forward the current state of a step to the progress reporter, if one with an
+   * `onStepProgress` callback is configured; otherwise do nothing.
+   *
+   * The decision status maps to SUCCESS for 'complete', FAILURE for 'stuck' or
+   * 'infeasible', and IN_PROGRESS for anything else.
+   *
+   * @param jobId - Job identifier passed through to the reporter
+   * @param stepNumber - Step number passed through to the reporter
+   * @param description - Step description passed through to the reporter
+   * @param decision - Agent decision whose status, commands and metadata are reported
+   * @param iteration - Agent iteration reported as `agentIteration`
+   */
+  private async reportStepProgress(
+    jobId: string,
+    stepNumber: number,
+    description: string,
+    decision: AgentDecision,
+    iteration: number
+  ): Promise<void> {
+    const reporter = this.progressReporter;
+    if (!reporter?.onStepProgress) {
+      return;
+    }
+    // Translate the agent's status vocabulary into the reporter's status enum
+    const resolveStatus = () => {
+      if (decision.status === 'complete') {
+        return StepExecutionStatus.SUCCESS;
+      }
+      if (decision.status === 'stuck' || decision.status === 'infeasible') {
+        return StepExecutionStatus.FAILURE;
+      }
+      return StepExecutionStatus.IN_PROGRESS;
+    };
+    const toolNames = decision.toolCalls?.map(t => t.name);
+    await reporter.onStepProgress({
+      jobId,
+      stepNumber,
+      description,
+      status: resolveStatus(),
+      code: decision.commands?.join('\n'),
+      // Agent metadata, included for transparency
+      agentIteration: iteration,
+      agentReasoning: decision.reasoning,
+      agentSelfReflection: decision.selfReflection,
+      agentExperiences: decision.experiences,
+      agentToolsUsed: toolNames,
+      agentStatus: decision.status
+    });
+  }
+}