testchimp-runner-core 0.0.33 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/dist/llm-facade.d.ts.map +1 -1
  2. package/dist/llm-facade.js +7 -7
  3. package/dist/llm-facade.js.map +1 -1
  4. package/dist/llm-provider.d.ts +9 -0
  5. package/dist/llm-provider.d.ts.map +1 -1
  6. package/dist/model-constants.d.ts +16 -5
  7. package/dist/model-constants.d.ts.map +1 -1
  8. package/dist/model-constants.js +17 -6
  9. package/dist/model-constants.js.map +1 -1
  10. package/dist/orchestrator/index.d.ts +1 -1
  11. package/dist/orchestrator/index.d.ts.map +1 -1
  12. package/dist/orchestrator/index.js +3 -2
  13. package/dist/orchestrator/index.js.map +1 -1
  14. package/dist/orchestrator/orchestrator-agent.d.ts +0 -8
  15. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  16. package/dist/orchestrator/orchestrator-agent.js +206 -405
  17. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  18. package/dist/orchestrator/orchestrator-prompts.d.ts +20 -0
  19. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
  20. package/dist/orchestrator/orchestrator-prompts.js +455 -0
  21. package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
  22. package/dist/orchestrator/tools/index.d.ts +2 -1
  23. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  24. package/dist/orchestrator/tools/index.js +4 -2
  25. package/dist/orchestrator/tools/index.js.map +1 -1
  26. package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
  27. package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
  28. package/dist/orchestrator/tools/verify-action-result.js +140 -0
  29. package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
  30. package/dist/orchestrator/types.d.ts +26 -0
  31. package/dist/orchestrator/types.d.ts.map +1 -1
  32. package/dist/orchestrator/types.js.map +1 -1
  33. package/dist/prompts.d.ts.map +1 -1
  34. package/dist/prompts.js +40 -34
  35. package/dist/prompts.js.map +1 -1
  36. package/dist/scenario-worker-class.d.ts.map +1 -1
  37. package/dist/scenario-worker-class.js +4 -1
  38. package/dist/scenario-worker-class.js.map +1 -1
  39. package/dist/utils/coordinate-converter.d.ts +32 -0
  40. package/dist/utils/coordinate-converter.d.ts.map +1 -0
  41. package/dist/utils/coordinate-converter.js +130 -0
  42. package/dist/utils/coordinate-converter.js.map +1 -0
  43. package/package.json +1 -1
  44. package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
  45. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
  46. package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
  47. package/plandocs/PHASE_1_COMPLETE.md +165 -0
  48. package/plandocs/PHASE_1_SUMMARY.md +184 -0
  49. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
  50. package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
  51. package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
  52. package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
  53. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
  54. package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
  55. package/src/llm-facade.ts +8 -8
  56. package/src/llm-provider.ts +11 -1
  57. package/src/model-constants.ts +17 -5
  58. package/src/orchestrator/index.ts +3 -2
  59. package/src/orchestrator/orchestrator-agent.ts +249 -424
  60. package/src/orchestrator/orchestrator-agent.ts.backup +1386 -0
  61. package/src/orchestrator/orchestrator-prompts.ts +474 -0
  62. package/src/orchestrator/tools/index.ts +2 -1
  63. package/src/orchestrator/tools/verify-action-result.ts +159 -0
  64. package/src/orchestrator/types.ts +48 -0
  65. package/src/prompts.ts +40 -34
  66. package/src/scenario-worker-class.ts +7 -2
  67. package/src/utils/coordinate-converter.ts +162 -0
  68. package/testchimp-runner-core-0.0.33.tgz +0 -0
  69. /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
  70. /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
  71. /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
@@ -0,0 +1,1386 @@
1
+ /**
2
+ * Orchestrator Agent
3
+ * Single agent with tool-use capabilities that maintains journey memory and executes scenarios
4
+ */
5
+
6
+ import { expect } from '@playwright/test';
7
+ import { LLMFacade } from '../llm-facade';
8
+ import { ProgressReporter, TokenUsage, StepExecutionStatus } from '../progress-reporter';
9
+ import { getEnhancedPageInfo } from '../utils/page-info-utils';
10
+ import { CoordinateConverter } from '../utils/coordinate-converter';
11
+ import { ToolRegistry, ToolExecutionContext } from './tool-registry';
12
+ import { DEFAULT_MODEL } from '../model-constants';
13
+ import {
14
+ AgentConfig,
15
+ AgentContext,
16
+ AgentDecision,
17
+ JourneyMemory,
18
+ MemoryStep,
19
+ OrchestratorStepResult,
20
+ SelfReflection,
21
+ NoteToFutureSelf,
22
+ CoordinateAction,
23
+ DEFAULT_AGENT_CONFIG
24
+ } from './types';
25
+ import { OrchestratorPrompts } from './orchestrator-prompts';
26
+
27
+ /**
28
+ * Orchestrator Agent - manages step execution with tool use and memory
29
+ */
30
+ export class OrchestratorAgent {
31
+ private llmFacade: LLMFacade;
32
+ private toolRegistry: ToolRegistry;
33
+ private progressReporter?: ProgressReporter;
34
+ private config: Required<AgentConfig>;
35
+ private logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void;
36
+ private debugMode: boolean = false;
37
+
38
+ constructor( // Stores the collaborators and resolves the effective agent configuration.
39
+ llmFacade: LLMFacade, // its llmProvider.callLLM() is used for every LLM request made by this class
40
+ toolRegistry: ToolRegistry, // supplies tool descriptions for the system prompt
41
+ config?: Partial<AgentConfig>, // optional overrides layered on top of DEFAULT_AGENT_CONFIG
42
+ progressReporter?: ProgressReporter, // optional; when present, onTokensUsed receives token usage
43
+ logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void, // optional; always invoked via `?.`
44
+ debugMode?: boolean // optional; treated as false when omitted
45
+ ) {
46
+ this.llmFacade = llmFacade;
47
+ this.toolRegistry = toolRegistry;
48
+ this.config = { ...DEFAULT_AGENT_CONFIG, ...config }; // later spread wins: caller-supplied keys replace defaults. NOTE(review): a key explicitly set to undefined would also replace its default - confirm callers never pass that.
49
+ this.progressReporter = progressReporter;
50
+ this.logger = logger;
51
+ this.debugMode = debugMode || false; // undefined collapses to false
52
+ }
53
+
54
+ setDebugMode(enabled: boolean): void { // Replaces the debugMode flag chosen at construction time.
55
+ this.debugMode = enabled; // NOTE(review): no read of debugMode is visible in this part of the file - presumably consumed further down; confirm.
56
+ }
57
+
58
+ /**
59
+ * Execute a single step of the scenario
+ *
+ * Runs an agent loop of at most config.maxIterationsPerStep iterations. Each iteration
+ * builds a fresh context, asks the LLM for a decision, runs any requested tools, then
+ * executes (in this order) blocker-clearing commands, the main commands and an optional
+ * coordinate-based fallback action. A coordinate action that raises no error is
+ * additionally checked by sending BEFORE/AFTER screenshots to the LLM.
+ *
+ * The loop ends when:
+ * - the decision status is not 'continue' (a 'complete' status is ignored if anything failed in that iteration),
+ * - a coordinate action has failed or been rejected and 2 coordinate attempts were used ('agent_stuck'),
+ * - 5 consecutive failures were counted ('agent_stuck'),
+ * - or the iteration limit is reached ('system_limit').
+ *
+ * @param page - page object (typed `any`); screenshot() and waitForLoadState() are called on it here - presumably a Playwright Page
+ * @param stepDescription - goal text of the step being executed
+ * @param stepNumber - 1-based position of the step (used as scenarioSteps[stepNumber - 1])
+ * @param totalSteps - number of steps in the scenario (used for logging and context)
+ * @param scenarioSteps - all step descriptions of the scenario
+ * @param memory - journey memory; experiences and latestNote are mutated in place
+ * @param jobId - job identifier forwarded to progress reporting and command execution
+ * @returns result with success flag, executed commands, iteration count, termination reason, memory and optional error text
60
+ */
61
+ async executeStep(
62
+ page: any,
63
+ stepDescription: string,
64
+ stepNumber: number,
65
+ totalSteps: number,
66
+ scenarioSteps: string[],
67
+ memory: JourneyMemory,
68
+ jobId: string
69
+ ): Promise<OrchestratorStepResult> {
70
+ this.logger?.(`\n[Orchestrator] ========== STEP ${stepNumber}/${totalSteps} ==========`);
71
+ this.logger?.(`[Orchestrator] 🎯 Goal: ${stepDescription}`);
72
+
73
+ let iteration = 0;
74
+ let previousReflection: SelfReflection | undefined = undefined;
75
+ let noteToSelf: NoteToFutureSelf | undefined = memory.latestNote; // Start with note from previous step
76
+ const commandsExecuted: string[] = [];
77
+ let consecutiveFailures = 0; // Track consecutive iterations with failed commands
78
+ let coordinateAttempts = 0; // Track coordinate mode attempts (max 2)
79
+
80
+ while (iteration < this.config.maxIterationsPerStep) {
81
+ iteration++;
82
+
83
+ this.logger?.(`\n[Orchestrator] === Iteration ${iteration}/${this.config.maxIterationsPerStep} ===`);
84
+
85
+ // Build context for agent
86
+ const context = await this.buildAgentContext(
87
+ page,
88
+ stepDescription,
89
+ stepNumber,
90
+ totalSteps,
91
+ scenarioSteps,
92
+ memory,
93
+ previousReflection,
94
+ consecutiveFailures,
95
+ noteToSelf // NEW: Pass note from previous iteration
96
+ );
97
+
98
+ // Call agent to make decision
99
+ const decision = await this.callAgent(
100
+ context,
101
+ jobId,
102
+ stepNumber,
103
+ iteration,
104
+ consecutiveFailures
105
+ );
106
+
107
+ // Log agent's reasoning
108
+ this.logAgentDecision(decision, iteration);
109
+
110
+ // Report progress
111
+ await this.reportStepProgress(jobId, stepNumber, stepDescription, decision, iteration);
112
+
113
+ // Execute tools if requested (tools are READ-ONLY, they don't change state)
114
+ let toolResults: Record<string, any> = {};
115
+
116
+ if (decision.toolCalls && decision.toolCalls.length > 0) {
117
+ toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
118
+
119
+ // If agent wants to wait for tool results before proceeding, call agent again
120
+ if (decision.needsToolResults) {
121
+ const updatedContext = { ...context, toolResults };
122
+ const continuedDecision = await this.callAgent(updatedContext, jobId, stepNumber, iteration, consecutiveFailures);
123
+
124
+ // Merge continued decision (only commands, status and reasoning fields are taken; other fields of the follow-up decision are not merged)
125
+ decision.commands = continuedDecision.commands || decision.commands;
126
+ decision.commandReasoning = continuedDecision.commandReasoning || decision.commandReasoning;
127
+ decision.status = continuedDecision.status;
128
+ decision.statusReasoning = continuedDecision.statusReasoning;
129
+ decision.reasoning = continuedDecision.reasoning;
130
+ }
131
+ }
132
+
133
+ // Execute commands sequentially
134
+ let iterationHadFailure = false;
135
+
136
+ // Handle blocker if detected (clear blocker FIRST, then proceed with main commands)
137
+ if (decision.blockerDetected && decision.blockerDetected.clearingCommands && decision.blockerDetected.clearingCommands.length > 0) {
138
+ this.logger?.(`[Orchestrator] 🚧 BLOCKER DETECTED: ${decision.blockerDetected.description}`);
139
+ this.logger?.(`[Orchestrator] 🧹 Clearing blocker with ${decision.blockerDetected.clearingCommands.length} command(s)...`);
140
+
141
+ const blockerResult = await this.executeCommandsSequentially(
142
+ decision.blockerDetected.clearingCommands,
143
+ page,
144
+ memory,
145
+ stepNumber,
146
+ iteration,
147
+ jobId
148
+ );
149
+
150
+ // Add blocker commands with comment to output
151
+ if (blockerResult.executed.length > 0) {
152
+ commandsExecuted.push(`// Blocker: ${decision.blockerDetected.description}`);
153
+ commandsExecuted.push(...blockerResult.executed);
154
+ }
155
+
156
+ // If blocker clearing failed, track it
157
+ if (!blockerResult.allSucceeded) {
158
+ this.logger?.(`[Orchestrator] ❌ Failed to clear blocker - continuing anyway`);
159
+ consecutiveFailures++;
160
+ iterationHadFailure = true;
161
+ }
162
+ }
163
+
164
+ // Execute main commands (only if no blocker failure)
165
+ if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
166
+ const executeResult = await this.executeCommandsSequentially(
167
+ decision.commands,
168
+ page,
169
+ memory,
170
+ stepNumber,
171
+ iteration,
172
+ jobId
173
+ );
174
+
175
+ commandsExecuted.push(...executeResult.executed);
176
+
177
+ // Track failures
178
+ if (!executeResult.allSucceeded) {
179
+ this.logger?.(`[Orchestrator] ⚠ Command execution stopped at failure`);
180
+ consecutiveFailures++;
181
+ iterationHadFailure = true;
182
+ } else {
183
+ consecutiveFailures = 0; // Reset on success
184
+ }
185
+ }
186
+
187
+ // Handle coordinate-based actions (NEW - fallback when selectors fail)
188
+ if (decision.coordinateAction && !iterationHadFailure) {
189
+ coordinateAttempts++;
190
+
191
+ this.logger?.(`[Orchestrator] 🎯 Coordinate Action (attempt ${coordinateAttempts}/2): ${decision.coordinateAction.action} at (${decision.coordinateAction.xPercent}%, ${decision.coordinateAction.yPercent}%)`);
192
+
193
+ try {
194
+ // BEFORE screenshot for visual verification. Playwright's page.screenshot() resolves to a Buffer and has no 'encoding' option, so the bytes are base64-encoded explicitly.
195
+ const beforeScreenshot = (await page.screenshot({ fullPage: false, type: 'jpeg', quality: 60 })).toString('base64');
196
+ const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
197
+
198
+ // Generate Playwright commands from coordinate action
199
+ const coordCommands = await CoordinateConverter.generateCommands(decision.coordinateAction, page);
200
+
201
+ this.logger?.(`[Orchestrator] Generated commands:`);
202
+ coordCommands.forEach(cmd => this.logger?.(` ${cmd}`));
203
+
204
+ // Execute coordinate commands
205
+ const coordResult = await this.executeCommandsSequentially(
206
+ coordCommands,
207
+ page,
208
+ memory,
209
+ stepNumber,
210
+ iteration,
211
+ jobId
212
+ );
213
+
214
+ commandsExecuted.push(...coordResult.executed);
215
+
216
+ if (!coordResult.allSucceeded) {
217
+ this.logger?.(`[Orchestrator] ❌ Coordinate action failed (Playwright error)`);
218
+ consecutiveFailures++;
219
+ iterationHadFailure = true;
220
+
221
+ // Give up after 2 coordinate attempts
222
+ if (coordinateAttempts >= 2) {
223
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
224
+ return {
225
+ success: false,
226
+ commands: commandsExecuted,
227
+ iterations: iteration,
228
+ terminationReason: 'agent_stuck',
229
+ memory,
230
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
231
+ };
232
+ }
233
+ } else {
234
+ this.logger?.(`[Orchestrator] ✅ Coordinate action succeeded (no Playwright error)`);
235
+
236
+ // CRITICAL: Verify visually if goal was achieved (coordinates might have clicked wrong place)
237
+ // Wait for network idle (smarter than fixed timeout - waits as needed, max 10s)
238
+ try {
239
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
240
+ } catch (e) {
241
+ // Network idle timeout - page may still be loading, but proceed with verification
242
+ this.logger?.(`[Orchestrator] ⚠️ Network idle timeout after 10s, proceeding with verification`, 'warn');
243
+ }
244
+
245
+ const afterScreenshot = (await page.screenshot({ fullPage: false, type: 'jpeg', quality: 60 })).toString('base64'); // same explicit base64 encoding as the BEFORE screenshot
246
+ const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
247
+
248
+ this.logger?.(`[Orchestrator] 📸 Verifying coordinate action visually...`);
249
+
250
+ const verificationRequest = {
251
+ model: 'gpt-5-mini',
252
+ systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
253
+ userPrompt: `Goal: ${scenarioSteps[stepNumber - 1]}\n\nA coordinate-based action was just executed. Compare the BEFORE and AFTER screenshots.\n\nDid the action achieve the goal? Respond with JSON:\n{\n "verified": boolean,\n "reasoning": "What changed (or didn't change) between screenshots",\n "visibleChanges": ["List of UI changes observed"],\n "clickedWrongPlace": boolean\n}\n\nBe strict: Only return verified=true if you clearly see the expected change matching the goal.`,
254
+ images: [
255
+ { label: 'BEFORE', dataUrl: beforeDataUrl },
256
+ { label: 'AFTER', dataUrl: afterDataUrl }
257
+ ]
258
+ };
259
+
260
+ const verificationResponse = await this.llmFacade.llmProvider.callLLM(verificationRequest); // NOTE(review): unlike callAgent, token usage of this call is not forwarded to progressReporter.onTokensUsed
261
+ const jsonMatch = verificationResponse.answer.match(/\{[\s\S]*\}/); // greedy: first '{' through last '}' of the answer
262
+
263
+ if (jsonMatch) {
264
+ const verificationResult = JSON.parse(jsonMatch[0]); // a parse error is handled by the catch below as a coordinate action error
265
+ const verified = verificationResult.verified === true;
266
+ const reasoning = verificationResult.reasoning || 'No reasoning provided';
267
+ const clickedWrongPlace = verificationResult.clickedWrongPlace === true;
268
+
269
+ this.logger?.(`[Orchestrator] 📊 Visual verification: ${verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
270
+ this.logger?.(`[Orchestrator] 💭 Reasoning: ${reasoning}`);
271
+
272
+ if (verified) {
273
+ // Goal achieved! Can mark complete
274
+ consecutiveFailures = 0;
275
+
276
+ // Store note for context
277
+ noteToSelf = {
278
+ fromIteration: iteration,
279
+ content: `Coordinate action verified successful: ${reasoning}`
280
+ };
281
+ memory.latestNote = noteToSelf;
282
+ } else {
283
+ // Goal NOT achieved despite no error
284
+ this.logger?.(`[Orchestrator] ⚠️ Coordinate click succeeded but goal NOT achieved`, 'warn');
285
+ consecutiveFailures++;
286
+ iterationHadFailure = true;
287
+
288
+ // Store diagnostic note for next attempt
289
+ const diagnostic = clickedWrongPlace
290
+ ? `Clicked wrong place. ${reasoning}. Try different coordinates.`
291
+ : `Action executed but goal not achieved. ${reasoning}. May need different approach.`;
292
+
293
+ noteToSelf = {
294
+ fromIteration: iteration,
295
+ content: diagnostic
296
+ };
297
+ memory.latestNote = noteToSelf;
298
+
299
+ // Give up after 2 coordinate attempts
300
+ if (coordinateAttempts >= 2) {
301
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts, none achieved goal) - marking stuck`);
302
+ return {
303
+ success: false,
304
+ commands: commandsExecuted,
305
+ iterations: iteration,
306
+ terminationReason: 'agent_stuck',
307
+ memory,
308
+ error: `Coordinate actions clicked but didn't achieve goal: ${reasoning}`
309
+ };
310
+ }
311
+ }
312
+ } else {
313
+ this.logger?.(`[Orchestrator] ⚠️ Could not parse verification response - treating as unverified`, 'warn');
314
+ consecutiveFailures++;
315
+ iterationHadFailure = true;
316
+ }
317
+ }
318
+
319
+ } catch (error: any) {
320
+ this.logger?.(`[Orchestrator] ❌ Coordinate action error: ${error.message}`, 'error');
321
+ consecutiveFailures++;
322
+ iterationHadFailure = true;
323
+
324
+ // Give up after 2 coordinate attempts
325
+ if (coordinateAttempts >= 2) {
326
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
327
+ return {
328
+ success: false,
329
+ commands: commandsExecuted,
330
+ iterations: iteration,
331
+ terminationReason: 'agent_stuck',
332
+ memory,
333
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
334
+ };
335
+ }
336
+ }
337
+ }
338
+
339
+ // System-enforced stuck detection (agent might not detect it)
340
+ // Allow 5 failures: 3 selector attempts + 2 coordinate attempts
341
+ if (consecutiveFailures >= 5) {
342
+ this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
343
+ return {
344
+ success: false,
345
+ commands: commandsExecuted,
346
+ iterations: iteration,
347
+ terminationReason: 'agent_stuck',
348
+ memory,
349
+ error: `Failed ${consecutiveFailures} iterations in a row - unable to proceed`
350
+ };
351
+ }
352
+
353
+ // Update memory with experiences
354
+ if (decision.experiences && decision.experiences.length > 0) {
355
+ for (const exp of decision.experiences) {
356
+ // Deduplicate - don't add if very similar experience exists (case-insensitive substring match in either direction)
357
+ const exists = memory.experiences.some(existing =>
358
+ existing.toLowerCase().includes(exp.toLowerCase()) ||
359
+ exp.toLowerCase().includes(existing.toLowerCase())
360
+ );
361
+
362
+ if (!exists) {
363
+ memory.experiences.push(exp);
364
+ this.logger?.(`[Orchestrator] 📚 Experience: ${exp}`);
365
+ }
366
+ }
367
+
368
+ // Cap experiences (keeps the most recent config.maxExperiences entries)
369
+ if (memory.experiences.length > this.config.maxExperiences) {
370
+ memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
371
+ }
372
+ }
373
+
374
+ // Store self-reflection for next iteration
375
+ previousReflection = decision.selfReflection;
376
+
377
+ // Store note to future self (NEW - tactical memory across iterations AND steps)
378
+ if (decision.noteToFutureSelf) {
379
+ noteToSelf = {
380
+ fromIteration: iteration,
381
+ content: decision.noteToFutureSelf
382
+ };
383
+ memory.latestNote = noteToSelf; // Persist in journey memory across steps
384
+ this.logger?.(`[Orchestrator] 📝 Note to self: ${decision.noteToFutureSelf}`);
385
+ }
386
+
387
+ // Check termination
388
+ if (decision.status !== 'continue') {
389
+ this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
390
+ this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);
391
+
392
+ // SAFETY CHECK: Don't allow "complete" if commands failed this iteration
393
+ if (decision.status === 'complete' && iterationHadFailure) {
394
+ this.logger?.(`[Orchestrator] ⚠️ OVERRIDE: Agent said "complete" but commands FAILED - forcing "continue"`, 'warn');
395
+ this.logger?.(`[Orchestrator] 💭 System: Commands must succeed before marking complete`);
396
+ // Don't return - continue to next iteration
397
+ } else {
398
+ // Valid termination
399
+ return {
400
+ success: decision.status === 'complete',
401
+ commands: commandsExecuted,
402
+ iterations: iteration,
403
+ terminationReason: decision.status === 'complete' ? 'complete' :
404
+ decision.status === 'stuck' ? 'agent_stuck' :
405
+ 'infeasible',
406
+ memory
407
+ };
408
+ }
409
+ }
410
+ }
411
+
412
+ // Hit iteration limit
413
+ this.logger?.(`[Orchestrator] ⚠ Maximum iterations reached (${this.config.maxIterationsPerStep})`);
414
+
415
+ return {
416
+ success: false,
417
+ commands: commandsExecuted,
418
+ iterations: iteration,
419
+ terminationReason: 'system_limit',
420
+ memory,
421
+ error: 'Maximum iterations reached'
422
+ };
423
+ }
424
+
425
+ /**
426
+ * Build context for agent
+ *
+ * Collects a fresh page snapshot via getEnhancedPageInfo(page), the current URL, the last
+ * config.recentStepsCount entries of memory.history, and the scenario split into
+ * completed / remaining steps around stepNumber (treated as 1-based).
+ *
+ * @param page - page object (typed `any`); url() is called on it and it is passed to getEnhancedPageInfo
+ * @param currentStepGoal - goal text of the step being executed
+ * @param stepNumber - 1-based step position used to slice scenarioSteps
+ * @param totalSteps - number of steps in the scenario
+ * @param scenarioSteps - all step descriptions; joined with newlines to form overallGoal
+ * @param memory - journey memory providing history, experiences and extractedData
+ * @param previousReflection - self-reflection of the previous iteration, exposed as previousIterationGuidance
+ * @param consecutiveFailures - accepted but not read inside this method
+ * @param noteFromPreviousIteration - tactical note passed through unchanged
+ * @returns the assembled AgentContext
427
+ */
428
+ private async buildAgentContext(
429
+ page: any,
430
+ currentStepGoal: string,
431
+ stepNumber: number,
432
+ totalSteps: number,
433
+ scenarioSteps: string[],
434
+ memory: JourneyMemory,
435
+ previousReflection?: SelfReflection,
436
+ consecutiveFailures?: number,
437
+ noteFromPreviousIteration?: NoteToFutureSelf // NEW
438
+ ): Promise<AgentContext> {
439
+ // Get fresh DOM
440
+ const currentPageInfo = await getEnhancedPageInfo(page);
441
+ const currentURL = page.url();
442
+
443
+ // Get recent steps
444
+ const recentSteps = memory.history.slice(-this.config.recentStepsCount);
445
+
446
+ // Build context
447
+ return {
448
+ overallGoal: scenarioSteps.join('\n'),
449
+ currentStepGoal,
450
+ stepNumber,
451
+ totalSteps,
452
+ completedSteps: scenarioSteps.slice(0, stepNumber - 1), // every step before the current one
453
+ remainingSteps: scenarioSteps.slice(stepNumber), // every step after the current one
454
+ currentPageInfo,
455
+ currentURL,
456
+ recentSteps,
457
+ experiences: memory.experiences,
458
+ extractedData: memory.extractedData,
459
+ previousIterationGuidance: previousReflection,
460
+ noteFromPreviousIteration // NEW: Pass tactical note from previous iteration
461
+ };
462
+ }
463
+
464
+ /**
465
+ * Call agent to make decision
+ *
+ * Builds the system and user prompts, sends them to the LLM with DEFAULT_MODEL, reports
+ * token usage when both usage data and a progressReporter.onTokensUsed callback exist,
+ * and returns the decision parsed from the answer by parseAgentDecision.
+ * With 3 or more consecutive failures the coordinate-mode system prompt is used instead
+ * of the regular one.
+ *
+ * Any error thrown inside the try block is logged and converted into a fallback decision
+ * with status 'stuck'; this method does not rethrow it.
+ *
+ * @param context - agent context used to build the user prompt
+ * @param jobId - job identifier attached to the token usage report
+ * @param stepNumber - step number attached to the token usage report
+ * @param iteration - iteration number attached to the token usage report
+ * @param consecutiveFailures - failure count that selects the prompt mode; also passed to buildUserPrompt
+ * @returns the parsed agent decision, or the 'stuck' fallback on error
466
+ */
467
+ private async callAgent(
468
+ context: AgentContext,
469
+ jobId: string,
470
+ stepNumber: number,
471
+ iteration: number,
472
+ consecutiveFailures?: number
473
+ ): Promise<AgentDecision> {
474
+ // Detect if coordinate mode should be activated
475
+ // Phase 1: Only 2 tiers (selectors → coordinates), so activate after 3 failures
476
+ // Phase 2: Will have 3 tiers (selectors → index → coordinates), threshold will be 5
477
+ const useCoordinateMode = consecutiveFailures !== undefined && consecutiveFailures >= 3;
478
+
479
+ // Build appropriate system prompt based on mode
480
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions(); // computed in both modes, but only the non-coordinate prompt uses it
481
+ const systemPrompt = useCoordinateMode
482
+ ? OrchestratorPrompts.buildCoordinateSystemPrompt()
483
+ : OrchestratorPrompts.buildSystemPrompt(toolDescriptions);
484
+ const userPrompt = OrchestratorPrompts.buildUserPrompt(context, consecutiveFailures);
485
+
486
+ // Log prompt lengths for monitoring
487
+ const systemLength = systemPrompt.length;
488
+ const userLength = userPrompt.length;
489
+ const totalLength = systemLength + userLength;
490
+ const estimatedTokens = Math.ceil(totalLength / 4); // Rough estimate: 4 chars per token
491
+
492
+ this.logger?.(`[Orchestrator] 📊 Prompt lengths: system=${systemLength} chars, user=${userLength} chars, total=${totalLength} chars (~${estimatedTokens} tokens)`, 'log');
493
+
494
+ try {
495
+ // Call LLM directly via provider
496
+ const llmRequest = {
497
+ model: DEFAULT_MODEL,
498
+ systemPrompt,
499
+ userPrompt
500
+ };
501
+
502
+ const response = await this.llmFacade.llmProvider.callLLM(llmRequest);
503
+
504
+ // Report token usage
505
+ if (response.usage && this.progressReporter?.onTokensUsed) {
506
+ const tokenUsage: TokenUsage = {
507
+ jobId,
508
+ stepNumber,
509
+ iteration,
510
+ inputTokens: response.usage.inputTokens,
511
+ outputTokens: response.usage.outputTokens,
512
+ includesImage: false, // the request built above carries no images
513
+ model: DEFAULT_MODEL,
514
+ timestamp: Date.now()
515
+ };
516
+ this.logger?.(`[Orchestrator] 💰 Reporting token usage: ${tokenUsage.inputTokens} + ${tokenUsage.outputTokens}`, 'log');
517
+ await this.progressReporter.onTokensUsed(tokenUsage);
518
+ } else if (!response.usage) {
519
+ this.logger?.(`[Orchestrator] ⚠ No usage data in LLM response`, 'warn');
520
+ }
521
+
522
+ // Parse response
523
+ return this.parseAgentDecision(response.answer); // parseAgentDecision is defined outside the visible part of this file
524
+
525
+ } catch (error: any) {
526
+ this.logger?.(`[Orchestrator] ✗ Agent call failed: ${error.message}`, 'error');
527
+
528
+ // Return fallback decision
529
+ return {
530
+ status: 'stuck',
531
+ statusReasoning: `Agent call failed: ${error.message}`,
532
+ reasoning: 'LLM call failed'
533
+ };
534
+ }
535
+ }
536
+
537
+ /**
538
+ * Build system prompt with tool descriptions
539
+ */
540
+ private buildSystemPrompt(): string {
541
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
542
+
543
+ return `You are an intelligent test automation agent that executes web scenarios using Playwright.
544
+
545
+ ${toolDescriptions}
546
+
547
+ YOUR RESPONSE FORMAT - Output JSON matching this interface:
548
+
549
+ interface AgentDecisionLLMResponse {
550
+ status: string; // REQUIRED: "continue" | "complete" | "stuck" | "infeasible"
551
+ reasoning: string; // REQUIRED: Your thinking - what you're doing and why
552
+ commands?: string[]; // Playwright commands to execute
553
+ commandReasoning?: string; // Why these commands
554
+ toolCalls?: Array<{ // Tools to call
555
+ name: string;
556
+ params: Record<string, any>;
557
+ }>;
558
+ toolReasoning?: string; // Why these tools
559
+ needsToolResults?: boolean; // Wait for tool results before commands
560
+ noteToFutureSelf?: string; // Free-form tactical note for next iteration
561
+ coordinateAction?: { // Use when selectors fail (after 3 attempts)
562
+ type: "coordinate";
563
+ action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
564
+ xPercent: number; // 0-100, 3 decimals (e.g., 15.755)
565
+ yPercent: number; // 0-100, 3 decimals (e.g., 8.500)
566
+ toXPercent?: number; // For drag
567
+ toYPercent?: number; // For drag
568
+ value?: string; // For fill
569
+ scrollAmount?: number; // For scroll
570
+ };
571
+ selfReflection?: {
572
+ guidanceForNext: string;
573
+ detectingLoop: boolean;
574
+ loopReasoning?: string;
575
+ };
576
+ experiences?: string[]; // App-specific learnings
577
+ blockerDetected?: {
578
+ description: string;
579
+ clearingCommands: string[];
580
+ };
581
+ stepReEvaluation?: {
582
+ detected: boolean;
583
+ issue: "prior_incomplete" | "already_done" | "wrong_order" | null;
584
+ explanation: string;
585
+ };
586
+ }
587
+
588
+ STATUS DECISION RULES (CRITICAL - Think carefully!):
589
+
590
+ RULE #1: NEVER MARK "complete" IF ANY COMMAND FAILED
591
+ - Command failed (timeout, error, exception)? → status MUST be "continue" or "stuck"
592
+ - EVEN IF you think the goal might be achieved, if command failed → NOT "complete"
593
+ - System will OVERRIDE and force "continue" if you violate this
594
+
595
+ RULE #2: Decision tree:
596
+ - Command FAILED? → "continue" (retry different way) OR "stuck" (exhausted all attempts)
597
+ - Command SUCCEEDED? → "complete" (goal done) OR "continue" (need more actions)
598
+
599
+ Status meanings:
600
+ - "complete": Commands succeeded AND goal achieved
601
+ - "continue": Command failed OR need more actions
602
+ - "stuck": Tried 5 iterations, all failed, can't proceed
603
+ - "infeasible": Goal impossible (element truly doesn't exist)
604
+
605
+ Examples:
606
+ ✅ Command: page.click('button') → Success → Goal done → status: "complete"
607
+ ❌ Command: page.click('button') → Timeout → status: "complete" (WRONG! Must be "continue")
608
+ ✅ Command: page.click('button') → Timeout → status: "continue" (try different selector)
609
+
610
+ STEP RE-EVALUATION (After 2+ failures - Question assumptions!):
611
+
612
+ After repeated failures, check:
613
+ 1. LOOK BACK: Did prior steps actually complete? (Check COMPLETED vs page state)
614
+ 2. LOOK FORWARD: Is current step already done?
615
+ 3. LOOK AHEAD: Is next step more feasible with current page state?
616
+
617
+ Stick to original plan unless clear evidence suggests otherwise.
618
+
619
+ BLOCKER DETECTION:
620
+
621
+ Use when unexpected UI blocks current goal (modals, overlays, prompts).
622
+ Provide clearingCommands to dismiss blocker, then regular commands execute.
623
+
624
+ Example: Cookie modal → clearingCommands: ["click Accept"], commands: ["fill email"]
625
+ NOT blockers: Wrong selectors, missing elements (those are "infeasible")
626
+
627
+ EXPERIENCES - App-specific patterns only:
628
+ - Concise, 1-2 per iteration
629
+ - Focus on app quirks (custom dropdowns, data-testid patterns, semantic selector availability)
630
+ - NOT obvious things ("button has role=button") or individual selectors
631
+ - Combine related learnings
632
+
633
+ CRITICAL RULES FOR DECISION MAKING:
634
+
635
+ DECISION-MAKING PROCESS:
636
+
637
+ 1. **FOCUS**: Do ONLY what current step asks. No extra actions unless step says "verify/check".
638
+
639
+ 2. **EFFICIENCY**: Don't click before fill. Don't add unnecessary preparation. Minimal commands.
640
+
641
+ 3. **WHEN COMMAND FAILS**: Analyze error → Check DOM for semantic selectors → Try different approach
642
+ Never repeat same selector. Avoid auto-generated IDs (#«r3»-form-item). Screenshot if stuck.
643
+
644
+ 4. **DETECT LOOPS**: Same selector failed 2+ times? Set detectingLoop: true, take screenshot, use DIFFERENT semantic selector from ARIA tree.
645
+
646
+ 5. **AVAILABLE CONTEXT**: page, expect (already imported), extractedData (from extract_data tool)
647
+
648
+ 6. **USE DOM SNAPSHOT** (CRITICAL - Don't invent selectors!):
649
+ - You get INTERACTIVE ELEMENTS with: position, tag, id, text, SUGGESTED SELECTORS
650
+ - ONLY use what's in the list - DON'T invent text/names not shown
651
+ - Use EXACT selectors provided (#submit-btn, getByRole('button', {name: 'X'}), etc.)
652
+ - ⚠️ If text appears multiple times → scope to parent or use role filter
653
+ - Element not in list? → take_screenshot
654
+ - ARIA tree = source of truth for roles/names
655
+
656
+ ⚠️ IGNORE NON-INTERACTIVE ELEMENTS:
657
+ - Tooltips (role="tooltip") - informational only, NOT clickable
658
+ - Status messages (role="status", role="alert") - display info, NOT clickable
659
+ - Popovers with no buttons inside - usually just show info on hover
660
+ - Hidden elements (aria-hidden="true", display:none) - can't interact
661
+
662
+ If you see a tooltip text in the goal, find the TRIGGER element (button/icon that shows the tooltip), NOT the tooltip itself.
663
+
664
+ ⚠️ TOOLTIPS CAUSE STRICT MODE VIOLATIONS:
665
+ - Tooltips/popovers DUPLICATE text in the DOM (button + tooltip both have same text)
666
+ - BAD: getByText('Settings') → matches both button AND tooltip → STRICT MODE ERROR
667
+ - GOOD: getByRole('button', { name: 'Settings' }) → matches only button, ignores tooltip
668
+ - GOOD: locator('button').getByText('Settings') → scoped to button element
669
+ - Always prefer role-based selectors when text might appear in tooltips
670
+
671
+ 7. **SELECTOR PREFERENCE** (CRITICAL):
672
+ Prefer in order:
673
+ 1. getByRole/Label/Placeholder - Semantic, stable
674
+ 2. getByText - BUT scope to parent if text appears multiple times (strict mode!)
675
+ 3. data-testid or stable IDs
676
+ 4. Avoid: Auto-generated IDs (#«r3»-form-item), unicode, complex CSS paths
677
+
678
+ ⚠️ Common mistakes:
679
+ - getByText('Settings') when "Settings" appears 2+ times → STRICT MODE ERROR
680
+ Fix: locator('#parent').getByText('Settings') OR getByRole('button').filter({hasText: 'Settings'})
681
+ - Missing timeout on goto() → Add { timeout: 30000 }
682
+ - Using auto-generated IDs → Break when components re-render
683
+
684
+ 8. **ASSERTIONS** (CRITICAL):
685
+ Use expect() ONLY when step explicitly asks: "verify", "check", "ensure", "confirm"
686
+
687
+ When to use:
688
+ - "Verify X appears" → await expect(locator).toBeVisible()
689
+ - "Check field is empty" → await expect(locator).toBeEmpty()
690
+ - "Confirm URL" → await expect(page).toHaveURL('...')
691
+
692
+ When NOT to use:
693
+ - "Send message" → DON'T verify it appeared (unless step asks)
694
+ - "Click Submit" → DON'T check button state
695
+ - "Fill field" → DON'T verify it's filled
696
+
697
+ 9. **TOOLS vs COMMANDS**:
698
+ Tools = read-only info gathering (screenshot, recall_history, extract_data)
699
+ Commands = state changes (Playwright: goto, click, fill, etc.)
700
+
701
+ Navigation commands MUST include timeout:
702
+ - page.goto(url, { waitUntil: 'load', timeout: 30000 })
703
+ - page.waitForLoadState('load', { timeout: 30000 })
704
+
705
+ 10. **ERROR ANALYSIS** (Think about what went wrong):
706
+ - "Timeout waiting for locator" → Selector doesn't exist, find different one in DOM
707
+ - "page.goto: Timeout" → Missing timeout param: page.goto(url, { timeout: 30000 })
708
+ - "strict mode violation" → Text appears multiple times. Scope to parent: locator('#parent').getByText()
709
+ - "Element is not <select>" → Custom dropdown, use .click() not .selectOption()
710
+ - Loop detected (same selector 2+ times) → Try completely different selector from ARIA tree
711
+
712
+ 11. **WHEN TO RUN COMMANDS vs TOOLS**:
713
+ - Confident about selectors from DOM → Run commands directly
714
+ - Unsure or failed 2+ times → Take screenshot first
715
+ - First iteration of a step → Usually can run commands from DOM
716
+
717
+ 12. **NOTE TO FUTURE SELF** (Tactical memory across iterations):
718
+
719
+ Write FREE-FORM notes for your next iteration about:
720
+ - What you tried and why it failed
721
+ - Hypothesis being tested
722
+ - Plan for next attempt
723
+ - Page behavior patterns observed
724
+
725
+ Your next iteration reads this FIRST - use it to maintain strategic continuity.
726
+
727
+ 13. **COORDINATE-BASED ACTIONS** (Last resort after 3 selector failures):
728
+
729
+ Activated automatically after 3 failures. Use PERCENTAGES (0-100, 3 decimals):
730
+ - xPercent: 0=left, 100=right
731
+ - yPercent: 0=top, 100=bottom
732
+
733
+ Format:
734
+ {
735
+ "coordinateAction": {
736
+ "type": "coordinate",
737
+ "action": "click|doubleClick|rightClick|hover|drag|fill|scroll",
738
+ "xPercent": 15.755, "yPercent": 8.500,
739
+ "toXPercent": 45.25, "toYPercent": 8.50, // For drag
740
+ "value": "text", // For fill
741
+ "scrollAmount": 500 // For scroll
742
+ }
743
+ }
744
+
745
+ AFTER coordinate action succeeds:
746
+ - If goal verification unclear → CALL verify_action_result tool
747
+ - Tool compares before/after screenshots to confirm goal achieved
748
+ - If verified: mark status="complete"
749
+ - If not verified: try different coordinates (2 attempts max)
750
+
751
+ Example after coordinate click:
752
+ {
753
+ "status": "continue",
754
+ "reasoning": "Coordinate click succeeded, verifying if dashboard page loaded",
755
+ "toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page with data grid visible"}}],
756
+ "needsToolResults": true
757
+ }`;
758
+ }
759
+
760
/**
 * Verify goal achievement using before/after screenshot comparison.
 * Use sparingly - expensive (2 images sent to the vision model).
 *
 * The model is asked to answer with a JSON object; only an explicit
 * `"goalAchieved": true` counts as success. Every other outcome (no JSON in
 * the reply, invalid JSON, a failed LLM call) is treated as "not achieved".
 *
 * @param goalDescription - Description of the change the action was expected to cause.
 * @param beforeDataUrl - Screenshot sent to the model under the label "BEFORE".
 * @param afterDataUrl - Screenshot sent to the model under the label "AFTER".
 * @returns `true` only when the parsed reply has `goalAchieved === true`; otherwise `false`.
 */
private async verifyGoalWithScreenshotComparison(
  goalDescription: string,
  beforeDataUrl: string,
  afterDataUrl: string
): Promise<boolean> {
  this.logger?.(`[Orchestrator] 🔍 Verifying goal with screenshot comparison...`);

  try {
    const request = {
      model: 'gpt-5-mini', // Vision model
      systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
      userPrompt: `Goal: ${goalDescription}

Compare the BEFORE and AFTER screenshots.

Did the action achieve the goal? Respond with JSON:
{
"goalAchieved": boolean,
"reasoning": "What changed (or didn't change) between screenshots",
"visibleChanges": ["List of UI changes observed"]
}

Focus on:
- Did expected elements appear/disappear?
- Did page navigate or content change?
- Are there visual indicators the action succeeded (new panels, forms, highlights)?

Be strict: Only return true if you clearly see the expected change.`,
      images: [
        { label: 'BEFORE', dataUrl: beforeDataUrl },
        { label: 'AFTER', dataUrl: afterDataUrl }
      ]
    };

    const response = await this.llmFacade.llmProvider.callLLM(request);

    // A reply without a textual answer is handled like an unparseable one
    // instead of surfacing as a TypeError from `.match`.
    const answer = typeof response?.answer === 'string' ? response.answer : '';

    // Greedy match: takes everything from the first "{" to the last "}".
    const jsonMatch = answer.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      this.logger?.(`[Orchestrator] ⚠️ Visual verification: Could not parse response`, 'warn');
      return false; // Default to not achieved if can't parse
    }

    const parsed = JSON.parse(jsonMatch[0]);
    this.logger?.(`[Orchestrator] 📊 Visual verification result: ${parsed.goalAchieved}`);
    this.logger?.(`[Orchestrator] 💭 Reasoning: ${parsed.reasoning}`);

    // Strict comparison: truthy strings such as "true" do not count.
    return parsed.goalAchieved === true;

  } catch (error: unknown) {
    // Rejections are not guaranteed to be Error instances; never let the
    // handler itself throw while formatting the message.
    const message = error instanceof Error ? error.message : String(error);
    this.logger?.(`[Orchestrator] ❌ Visual verification failed: ${message}`, 'error');
    return false; // Default to not achieved on error
  }
}
818
+
819
+ /**
820
+ * Build coordinate-specific system prompt (used when selectors repeatedly fail).
+ *
+ * Returns a fixed prompt string (the template contains no interpolation) that:
+ * - tells the model that selector generation failed and coordinates are mandatory,
+ * - explains how to locate the target in the screenshot and sanity-check the estimate,
+ * - documents the expected JSON response shape (`status`, `reasoning`,
+ *   `coordinateAction`, optional `noteToFutureSelf`),
+ * - lists the supported actions: click, doubleClick, rightClick, hover, fill, drag, scroll.
+ *
+ * Coordinates are described to the model as percentages (0-100, 3 decimals)
+ * measured from the top-left of the viewport.
821
+ */
822
+ private buildCoordinateSystemPrompt(): string {
823
+ return `You are a visual web automation expert. Selector generation has FAILED multiple times.
824
+
825
+ YOU MUST NOW USE COORDINATE-BASED ACTIONS (this is not optional).
826
+
827
+ SCREENSHOT PROVIDED:
828
+ You will see a screenshot with visual indicators (bounding boxes or markers).
829
+
830
+ CRITICAL - IDENTIFY THE CORRECT ELEMENT:
831
+ 1. READ the step goal carefully - what specific element are you looking for?
832
+ 2. LOCATE that element in the screenshot (NOT a similar-looking element!)
833
+ 3. VERIFY position using screen regions:
834
+ - Left sidebar/menu: xPercent ~5-25% (FAR LEFT)
835
+ - Center content: xPercent ~30-70%
836
+ - Right panel/sidebar: xPercent ~75-95% (FAR RIGHT)
837
+ 4. CALCULATE percentages from element's CENTER position
838
+ 5. SANITY CHECK your percentages:
839
+ - Sidebar menu item at 85%? WRONG - that's far right, not sidebar!
840
+ - Button in top-left at 90%? WRONG - that's top-right!
841
+ - Element description says "left" but x > 50%? WRONG - recheck!
842
+
843
+ Example thought process:
844
+ Goal: "Click Settings link in left navigation"
845
+ → I see "Settings" text in LEFT navigation panel in the screenshot
846
+ → Visual estimate: The link appears in the far left sidebar
847
+ → Horizontal: The link center is roughly 1/8th from the left edge → ~12-13% from left
848
+ → Vertical: The link center is roughly 1/3rd down from top → ~30-35% from top
849
+ → xPercent: 12.500, yPercent: 32.000
850
+ → Sanity check: 12.5% is FAR LEFT ✓ (NOT 80%+ which would be far right!)
851
+ → Description: "Clicking center of Settings link in left sidebar"
852
+
853
+ CRITICAL VISUAL ESTIMATION TIPS:
854
+ - Divide screenshot mentally into quadrants/regions
855
+ - Left sidebar usually ~5-20% from left, center content ~30-70%, right sidebar ~75-95%
856
+ - Aim for CENTER of element, not edges
857
+ - Top bar usually 0-10% from top, footer usually 90-100%
858
+ - Be conservative: slightly off-center is better than way off
859
+
860
+ YOUR RESPONSE FORMAT - Output JSON matching this interface:
861
+
862
+ interface AgentDecisionLLMResponse {
863
+ status: string; // REQUIRED: "continue" (usually for coordinate mode)
864
+ reasoning: string; // REQUIRED: "I see [element] at (X%, Y%) - using coordinates"
865
+ coordinateAction: { // REQUIRED in coordinate mode
866
+ type: "coordinate";
867
+ action: "click" | "doubleClick" | "rightClick" | "hover" | "drag" | "fill" | "scroll";
868
+ xPercent: number; // 0-100, 3 decimals
869
+ yPercent: number; // 0-100, 3 decimals
870
+ toXPercent?: number; // For drag
871
+ toYPercent?: number; // For drag
872
+ value?: string; // For fill
873
+ scrollAmount?: number; // For scroll
874
+ };
875
+ noteToFutureSelf?: string; // Optional: What to try if this fails
876
+ }
877
+
878
+ COORDINATE REFERENCE:
879
+ - Top-left corner: xPercent=0, yPercent=0
880
+ - Top-right corner: xPercent=100, yPercent=0
881
+ - Bottom-left corner: xPercent=0, yPercent=100
882
+ - Bottom-right corner: xPercent=100, yPercent=100
883
+ - Center of screen: xPercent=50, yPercent=50
884
+
885
+ Use 3 decimal places for precision (e.g., 15.755, not 16).
886
+
887
+ ACTIONS:
888
+
889
+ **Physical clicks:**
890
+ - click: { action: "click", xPercent: 15.755, yPercent: 8.500 }
891
+ - doubleClick: { action: "doubleClick", xPercent: 15.755, yPercent: 8.500 }
892
+ - rightClick: { action: "rightClick", xPercent: 15.755, yPercent: 8.500 }
893
+ - hover: { action: "hover", xPercent: 15.755, yPercent: 8.500 }
894
+
895
+ **Input actions:**
896
+ - fill: Click then type
897
+ { action: "fill", xPercent: 30.000, yPercent: 25.000, value: "alice@example.com" }
898
+
899
+ **Movement actions:**
900
+ - drag: From one position to another
901
+ { action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000 }
902
+ - scroll: At position, scroll by amount
903
+ { action: "scroll", xPercent: 50.000, yPercent: 50.000, scrollAmount: 500 }
904
+
905
+ CRITICAL RULES:
906
+ - Percentages are from viewport TOP-LEFT (not full page)
907
+ - Use element CENTER for coordinates, not edges
908
+ - Be precise with decimals - wrong coords click wrong element
909
+ - For fill: system will click at (x%,y%) then type value automatically
910
+ - For drag: toXPercent/toYPercent are REQUIRED
911
+
912
+ DO NOT try to generate selectors - that approach already failed. Use coordinates only.
913
+ This is a last-resort mechanism, but it WILL work if you provide accurate percentages.`;
914
+ }
915
+
916
/**
 * Build the user prompt for one agent iteration.
 *
 * Sections are emitted in a fixed order: static execution rules first (so the
 * prefix stays stable for LLM caching), then the note from the previous
 * iteration, failure warnings, the coordinate-mode banner, goals, page state,
 * recent steps, learnings, extracted data, previous guidance, tool results and
 * the closing instruction.
 *
 * @param context - Snapshot of the journey state for this iteration.
 * @param consecutiveFailures - Number of consecutive failures so far. Exactly 2
 *   adds a warning, 3 or more activates the coordinate-mode banner, 4 or more
 *   additionally adds the critical warning.
 * @returns The prompt text, sections joined with newlines.
 */
private buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
  // Maximum number of characters of the pretty-printed ARIA snapshot to include.
  const ARIA_SNAPSHOT_CHAR_LIMIT = 5000;

  const parts: string[] = [];

  // Put static instructions first for LLM caching efficiency
  parts.push('STEP EXECUTION RULES:');
  parts.push('- DO ONLY what the current step asks - NO extra actions or verifications');
  parts.push('- If step doesn\'t say "verify/check/confirm" → DON\'T add expect() assertions');
  parts.push('- Mark "complete" ONLY if commands succeeded');
  parts.push('- Try screenshot tool if you need visual context');
  parts.push('- Max 5 iterations per step, then forced STUCK\n');

  // Dynamic content follows (changes per iteration)
  parts.push('=== CURRENT CONTEXT ===\n');

  // Display note from previous iteration (high priority tactical info)
  if (context.noteFromPreviousIteration) {
    const note = context.noteFromPreviousIteration;
    parts.push(`📝 YOUR NOTE FROM ITERATION ${note.fromIteration}:`);
    parts.push(` ${note.content}`);
    parts.push(` ^^ READ THIS - your previous self left important tactical guidance ^^`);
    parts.push('');
  }

  // System warnings for accumulated failures
  if (consecutiveFailures && consecutiveFailures >= 2 && consecutiveFailures < 3) {
    parts.push(`⚠️ SYSTEM WARNING: ${consecutiveFailures} failures!`);
    parts.push(`Take screenshot if needed. Try different selector strategy.`);
    parts.push(`Question assumptions: Am I at the right step?`);
    parts.push(`⚠️\n`);
  } else if (consecutiveFailures && consecutiveFailures >= 4) {
    parts.push(`⚠️ CRITICAL: ${consecutiveFailures} failures!`);
    parts.push(`Next failure will force STUCK. Coordinate mode should be active.\n`);
  }

  // Trigger coordinate mode if many failures (Phase 1: after 3 failures)
  if (consecutiveFailures && consecutiveFailures >= 3) {
    parts.push(`🎯🎯🎯 COORDINATE MODE ACTIVATED 🎯🎯`);
    parts.push(`Selector generation has failed ${consecutiveFailures} times.`);
    parts.push(`You MUST use coordinate-based action now (percentages).`);
    parts.push(`Provide coordinateAction with xPercent/yPercent (0-100, 3 decimals for precision).`);
    parts.push(`See system prompt for coordinate action format.`);
    parts.push(`🎯🎯\n`);
  }

  // Goals - make current step very prominent
  parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
  parts.push(`${context.currentStepGoal}`);
  parts.push(``);
  parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);

  if (context.completedSteps.length > 0) {
    parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
  }
  if (context.remainingSteps.length > 0) {
    parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
  }

  // Current page state (most variable content - at the end)
  parts.push(`\nCURRENT PAGE:`);
  parts.push(`URL: ${context.currentURL}`);
  parts.push(`Title: ${context.currentPageInfo.title}`);
  parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
  parts.push(context.currentPageInfo.formattedElements);
  parts.push(`\nARIA TREE (hierarchical structure):`);

  // Serialize once and decide truncation on the SAME string that is emitted.
  // JSON.stringify returns undefined for an undefined snapshot, hence the fallback.
  const ariaJson = JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2) ?? '';
  parts.push(ariaJson.substring(0, ARIA_SNAPSHOT_CHAR_LIMIT));
  if (ariaJson.length > ARIA_SNAPSHOT_CHAR_LIMIT) {
    parts.push('... (truncated)');
  }
  parts.push('');

  // Recent steps (most variable content - at the end)
  if (context.recentSteps.length > 0) {
    parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
    for (const step of context.recentSteps) {
      const status = step.result === 'success' ? '✓' : '✗';
      parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
      parts.push(` Code: ${step.code}`);
      if (step.result === 'failure' && step.error) {
        parts.push(` ❌ ERROR: ${step.error}`);
        parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
      } else {
        parts.push(` Result: ${step.observation}`);
      }
    }
    parts.push('');

    // Detect repeated failures: the two most recent failures ran identical code
    const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
    if (recentFailures.length >= 2) {
      const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
        i === 0 || s.code === arr[i - 1].code
      );
      if (sameSelector) {
        parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
        parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
        parts.push(` YOU MUST try a completely different selector this time!\n`);
      }
    }
  }

  // Experiences
  if (context.experiences.length > 0) {
    parts.push(`\nLEARNINGS (${context.experiences.length}):`);
    for (const exp of context.experiences) {
      parts.push(` - ${exp}`);
    }
    parts.push('');
  }

  // Extracted data
  if (Object.keys(context.extractedData).length > 0) {
    parts.push(`\nEXTRACTED DATA:`);
    for (const [key, value] of Object.entries(context.extractedData)) {
      parts.push(` ${key}: ${value}`);
    }
    parts.push('');
  }

  // Previous reflection
  if (context.previousIterationGuidance) {
    parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
    parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
    if (context.previousIterationGuidance.detectingLoop) {
      parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
    }
    parts.push('');
  }

  // Tool results (if any)
  if (context.toolResults) {
    parts.push(`\n========== TOOL RESULTS ==========`);
    for (const [toolName, result] of Object.entries(context.toolResults)) {
      parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);

      if (result.learning) {
        parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
        parts.push(result.learning);
        parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
      }

      if (result.error) {
        parts.push(` Error: ${result.error}`);
      }
    }
    parts.push(`\n===================================\n`);
  }

  parts.push('=== WHAT DO YOU WANT TO DO? ===');
  parts.push('Respond with JSON following the format specified in the system prompt.');

  return parts.join('\n');
}
1071
+
1072
/**
 * Turn the raw LLM reply into an AgentDecision.
 *
 * The first "{" through the last "}" of the reply is parsed as JSON. The
 * result must carry `status` plus at least one of `reasoning` /
 * `statusReasoning`; when only `statusReasoning` is present it is copied into
 * `reasoning`. Any failure is logged and converted into a `stuck` decision
 * instead of being thrown to the caller.
 */
private parseAgentDecision(response: string): AgentDecision {
  const logError = (text: string): void => {
    this.logger?.(text, 'error');
  };
  const dumpResponse = (): void => {
    logError(`[Orchestrator] 📄 FULL LLM RESPONSE:\n${response}`);
  };

  try {
    // Extract JSON from response
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      logError(`[Orchestrator] ✗ No JSON found in LLM response`);
      dumpResponse();
      throw new Error('No JSON found in response');
    }

    const decision = JSON.parse(jsonMatch[0]);

    // Required: status, and reasoning under either of its two accepted names
    // (LLMs sometimes only provide one of them).
    const hasStatus = !!decision.status;
    const hasReasoning = !!decision.reasoning;
    const hasStatusReasoning = !!decision.statusReasoning;

    if (!(hasStatus && (hasReasoning || hasStatusReasoning))) {
      logError(`[Orchestrator] ✗ Missing required fields in parsed JSON`);
      dumpResponse();
      logError(`[Orchestrator] 📄 PARSED JSON:\n${JSON.stringify(decision, null, 2)}`);
      logError(`[Orchestrator] ❌ Has status: ${hasStatus}, Has reasoning: ${hasReasoning}, Has statusReasoning: ${hasStatusReasoning}`);
      throw new Error('Missing required fields: status and (reasoning or statusReasoning)');
    }

    // Normalize: fall back to statusReasoning when reasoning is absent
    // (the validation above guarantees it exists in that case).
    if (!hasReasoning) {
      decision.reasoning = decision.statusReasoning;
    }

    return decision as AgentDecision;

  } catch (error: any) {
    logError(`[Orchestrator] ✗ Failed to parse agent decision: ${error.message}`);

    // The two validation failures above already dumped the full reply.
    const alreadyDumped =
      error.message.includes('Missing required fields') ||
      error.message.includes('No JSON found');
    if (!alreadyDumped) {
      dumpResponse();
    }

    // Return fallback
    return {
      status: 'stuck',
      statusReasoning: 'Failed to parse agent response',
      reasoning: `Parse error: ${error.message}`
    };
  }
}
1120
+
1121
/**
 * Execute the tool calls requested by the agent, one after another.
 *
 * At most `config.maxToolCallsPerIteration` calls are executed; the rest are
 * dropped. Results are keyed by tool name, so when the same tool is called
 * more than once in an iteration only the last result is kept.
 *
 * @param toolCalls - Tool invocations (`name` plus optional `params`) from the agent decision.
 * @param page - Page object handed to each tool via the execution context.
 * @param memory - Journey memory handed to each tool via the execution context.
 * @param stepNumber - Current step number, handed to each tool via the execution context.
 * @returns Map from tool name to that tool's result.
 */
private async executeTools(
  toolCalls: any[],
  page: any,
  memory: JourneyMemory,
  stepNumber: number
): Promise<Record<string, any>> {
  this.logger?.(`[Orchestrator] 🔧 Executing ${toolCalls.length} tool(s)`);

  const results: Record<string, any> = {};
  const toolContext: ToolExecutionContext = {
    page,
    memory,
    stepNumber,
    logger: this.logger
  };

  for (const toolCall of toolCalls.slice(0, this.config.maxToolCallsPerIteration)) {
    // JSON.stringify yields undefined when params is missing; a tool call
    // without params must not crash the batch while building a log line.
    const paramsPreview = (JSON.stringify(toolCall.params) ?? '{}').substring(0, 50);
    this.logger?.(`[Orchestrator] ▶ ${toolCall.name}(${paramsPreview}...)`);

    const result = await this.toolRegistry.execute(toolCall, toolContext);
    results[toolCall.name] = result;

    if (result.success) {
      this.logger?.(`[Orchestrator] ✓ ${toolCall.name} succeeded`);
    } else {
      this.logger?.(`[Orchestrator] ✗ ${toolCall.name} failed: ${result.error}`, 'error');
    }
  }

  return results;
}
1155
+
1156
/**
 * Execute commands sequentially inside one generated async function.
 *
 * At most `config.maxCommandsPerIteration` commands are run. Each command is
 * wrapped in its own try block that records its outcome; execution stops at
 * the first failing command. Commands that succeeded before the failure are
 * recorded in history and returned in `executed`, and the failure is
 * attributed to the command that actually threw.
 *
 * Scope note: because every command sits in its own try block, `let`/`const`
 * declared by one command are NOT visible to later commands; only `var`
 * declarations (function-scoped) carry over.
 *
 * SECURITY: command strings are compiled with `new Function` and run with
 * full access to `page` and `expect`. They must come from a trusted source.
 *
 * @param commands - Code snippets to run, in order.
 * @param page - Page object exposed to the snippets as `page`.
 * @param memory - Journey memory whose `history` receives one entry per recorded command.
 * @param stepNumber - Step number stored on each history entry.
 * @param iteration - Iteration number stored on each history entry.
 * @param jobId - Unused here; kept so existing callers keep compiling.
 * @returns The commands that ran successfully and whether the whole batch succeeded.
 */
private async executeCommandsSequentially(
  commands: string[],
  page: any,
  memory: JourneyMemory,
  stepNumber: number,
  iteration: number,
  jobId: string
): Promise<{ executed: string[]; allSucceeded: boolean }> {
  this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s) in shared context`);

  const executed: string[] = [];
  const limitedCommands = commands.slice(0, this.config.maxCommandsPerIteration);
  const total = limitedCommands.length;

  // On failure the generated code RETURNS the outcomes collected so far
  // instead of rethrowing, so the per-command results survive the failure.
  const commandsWithTracking = limitedCommands.map((cmd, i) => {
    return `
// Command ${i + 1}/${total}
try {
  ${cmd}
  __results.push({ index: ${i}, success: true });
} catch (error) {
  __results.push({ index: ${i}, success: false, error: (error && error.message) || String(error) });
  return __results; // Stop on first failure
}`;
  }).join('\n');

  const wrappedCode = `
const __results = [];
${commandsWithTracking}
return __results;
`;

  let outcomes: Array<{ index: number; success: boolean; error?: string }> = [];
  let failure: { index: number; message: string } | undefined;

  try {
    // Pass both page and expect to make Playwright assertions available
    const func = new Function('page', 'expect', 'return (async () => { ' + wrappedCode + ' })()');
    const returned = await func(page, expect);

    if (Array.isArray(returned)) {
      outcomes = returned;
    } else {
      // A command executed its own `return`, bypassing the tracking code.
      failure = { index: 0, message: 'Commands returned before their results could be recorded' };
    }
  } catch (error: any) {
    // Reached when the batch cannot be compiled (e.g. a syntax error in a
    // command). Nothing ran and the culprit is unknown, so the first command
    // is reported.
    failure = { index: 0, message: error?.message || String(error) };
  }

  // Record every command that succeeded; stop at the first failure.
  for (const outcome of outcomes) {
    if (!outcome.success) {
      failure = { index: outcome.index, message: outcome.error ?? 'Unknown error' };
      break;
    }

    const cmd = limitedCommands[outcome.index];
    this.logger?.(`[Orchestrator] ✓ [${outcome.index + 1}/${total}] Success`);

    memory.history.push({
      stepNumber,
      iteration,
      action: `Command ${outcome.index + 1}/${total}`,
      code: cmd,
      result: 'success',
      observation: 'Executed successfully',
      url: page.url(),
      timestamp: Date.now()
    });

    executed.push(cmd);
  }

  if (failure && total > 0) {
    const failedIndex = failure.index;
    const errorMessage = failure.message;

    // Capture page state for debug logging
    let pageStateDebug = '';
    if (this.debugMode) {
      try {
        const pageInfo = await getEnhancedPageInfo(page);
        pageStateDebug = `
=== DEBUG: PAGE STATE AT FAILURE ===
URL: ${page.url()}
Title: ${pageInfo.title}

INTERACTIVE ELEMENTS:
${pageInfo.formattedElements}

ARIA SNAPSHOT:
${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}

====================================`;
      } catch (debugError: any) {
        pageStateDebug = `Failed to capture page state: ${debugError.message}`;
      }
    }

    this.logger?.(`[Orchestrator] ✗ [${failedIndex + 1}/${total}] Failed: ${errorMessage}`, 'error');

    // Log detailed debug info
    if (this.debugMode && pageStateDebug) {
      this.logger?.(pageStateDebug, 'debug');
    }

    memory.history.push({
      stepNumber,
      iteration,
      action: `Command ${failedIndex + 1}/${total} - FAILED`,
      code: limitedCommands[failedIndex],
      result: 'failure',
      observation: `Failed with error: ${errorMessage}. This selector likely doesn't exist or is incorrect.`,
      error: errorMessage,
      url: page.url(),
      timestamp: Date.now()
    });

    if (failedIndex < total - 1) {
      this.logger?.(`[Orchestrator] ⚠ Skipping remaining ${total - failedIndex - 1} command(s)`, 'warn');
    }
  }

  // Cap history
  if (memory.history.length > this.config.maxHistorySize) {
    memory.history = memory.history.slice(-this.config.maxHistorySize);
  }

  return { executed, allSucceeded: failure === undefined };
}
1290
+
1291
/**
 * Execute a single command exactly once.
 *
 * The command text becomes the body of an async function that receives
 * `page` and `expect` as its only named inputs. The command is never retried
 * here: a failing command may already have changed page state (clicked,
 * typed, navigated), so running it a second time could repeat those side
 * effects.
 *
 * SECURITY: `cmd` is compiled with `new Function`; it must come from a
 * trusted source.
 *
 * @param cmd - Code to run; may use `await`, `page` and `expect`.
 * @param page - Page object exposed to the command as `page`.
 * @throws Whatever the command throws, or a SyntaxError if `cmd` cannot be compiled.
 */
private async executeCommand(cmd: string, page: any): Promise<void> {
  // Pass both page and expect to make Playwright assertions available
  const run = new Function('page', 'expect', `return (async () => { ${cmd} })()`);
  await run(page, expect);
}
1307
+
1308
/**
 * Write a human-readable summary of an agent decision to the logger:
 * reasoning, self-reflection, requested tools, detected blocker, step
 * re-evaluation and a preview of up to three commands.
 * The `iteration` argument is accepted but not used in the output.
 */
private logAgentDecision(decision: AgentDecision, iteration: number): void {
  const info = (text: string): void => {
    this.logger?.(text);
  };
  const warn = (text: string): void => {
    this.logger?.(text, 'warn');
  };

  info(`[Orchestrator] 💭 REASONING: ${decision.reasoning}`);

  const reflection = decision.selfReflection;
  if (reflection) {
    info(`[Orchestrator] 🧠 SELF-REFLECTION:`);
    info(`[Orchestrator] Next: ${reflection.guidanceForNext}`);
    if (reflection.detectingLoop) {
      warn(`[Orchestrator] 🔄 LOOP DETECTED: ${reflection.loopReasoning}`);
    }
  }

  const tools = decision.toolCalls;
  if (tools && tools.length > 0) {
    const toolNames = tools.map(t => t.name).join(', ');
    info(`[Orchestrator] 🔧 TOOLS: ${toolNames}`);
    if (decision.toolReasoning) {
      info(`[Orchestrator] 📋 Why: ${decision.toolReasoning}`);
    }
  }

  const blocker = decision.blockerDetected;
  if (blocker) {
    warn(`[Orchestrator] 🚧 BLOCKER: ${blocker.description}`);
    info(`[Orchestrator] 🧹 Clearing with ${blocker.clearingCommands.length} command(s)`);
  }

  if (decision.stepReEvaluation?.detected) {
    warn(`[Orchestrator] 🔍 STEP RE-EVALUATION: ${decision.stepReEvaluation.issue}`);
    info(`[Orchestrator] 📝 Explanation: ${decision.stepReEvaluation.explanation}`);
  }

  const cmds = decision.commands;
  if (cmds && cmds.length > 0) {
    info(`[Orchestrator] 📝 COMMANDS (${cmds.length}):`);

    // Preview only the first three commands, each cut to 80 characters.
    const previewCount = Math.min(cmds.length, 3);
    for (let n = 0; n < previewCount; n++) {
      info(`[Orchestrator] ${n + 1}. ${cmds[n].substring(0, 80)}...`);
    }
    if (cmds.length > 3) {
      info(`[Orchestrator] ... and ${cmds.length - 3} more`);
    }

    if (decision.commandReasoning) {
      info(`[Orchestrator] 💡 Why: ${decision.commandReasoning}`);
    }
  }

  // Experiences will be logged when added to memory, no need to log here
}
1354
+
1355
/**
 * Forward the state of the current step to the progress reporter.
 * Does nothing when no reporter, or no `onStepProgress` handler, is set.
 *
 * Status mapping: `complete` → SUCCESS, `stuck` / `infeasible` → FAILURE,
 * anything else → IN_PROGRESS.
 */
private async reportStepProgress(
  jobId: string,
  stepNumber: number,
  description: string,
  decision: AgentDecision,
  iteration: number
): Promise<void> {
  const reporter = this.progressReporter;
  if (!reporter?.onStepProgress) {
    return;
  }

  const resolveStatus = () => {
    switch (decision.status) {
      case 'complete':
        return StepExecutionStatus.SUCCESS;
      case 'stuck':
      case 'infeasible':
        return StepExecutionStatus.FAILURE;
      default:
        return StepExecutionStatus.IN_PROGRESS;
    }
  };

  await reporter.onStepProgress({
    jobId,
    stepNumber,
    description,
    status: resolveStatus(),
    code: decision.commands?.join('\n'),
    // Include agent metadata for transparency
    agentIteration: iteration,
    agentReasoning: decision.reasoning,
    agentSelfReflection: decision.selfReflection,
    agentExperiences: decision.experiences,
    agentToolsUsed: decision.toolCalls?.map(t => t.name),
    agentStatus: decision.status
  });
}
1384
+ }
1385
+
1386
+