testchimp-runner-core 0.0.33 → 0.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. package/dist/execution-service.d.ts +1 -4
  2. package/dist/execution-service.d.ts.map +1 -1
  3. package/dist/execution-service.js +155 -468
  4. package/dist/execution-service.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +11 -1
  8. package/dist/index.js.map +1 -1
  9. package/dist/llm-facade.d.ts.map +1 -1
  10. package/dist/llm-facade.js +7 -7
  11. package/dist/llm-facade.js.map +1 -1
  12. package/dist/llm-provider.d.ts +9 -0
  13. package/dist/llm-provider.d.ts.map +1 -1
  14. package/dist/model-constants.d.ts +16 -5
  15. package/dist/model-constants.d.ts.map +1 -1
  16. package/dist/model-constants.js +17 -6
  17. package/dist/model-constants.js.map +1 -1
  18. package/dist/orchestrator/decision-parser.d.ts +18 -0
  19. package/dist/orchestrator/decision-parser.d.ts.map +1 -0
  20. package/dist/orchestrator/decision-parser.js +127 -0
  21. package/dist/orchestrator/decision-parser.js.map +1 -0
  22. package/dist/orchestrator/index.d.ts +4 -2
  23. package/dist/orchestrator/index.d.ts.map +1 -1
  24. package/dist/orchestrator/index.js +15 -2
  25. package/dist/orchestrator/index.js.map +1 -1
  26. package/dist/orchestrator/orchestrator-agent.d.ts +17 -22
  27. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  28. package/dist/orchestrator/orchestrator-agent.js +708 -577
  29. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  30. package/dist/orchestrator/orchestrator-prompts.d.ts +32 -0
  31. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
  32. package/dist/orchestrator/orchestrator-prompts.js +737 -0
  33. package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
  34. package/dist/orchestrator/page-som-handler.d.ts +106 -0
  35. package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
  36. package/dist/orchestrator/page-som-handler.js +1353 -0
  37. package/dist/orchestrator/page-som-handler.js.map +1 -0
  38. package/dist/orchestrator/som-types.d.ts +149 -0
  39. package/dist/orchestrator/som-types.d.ts.map +1 -0
  40. package/dist/orchestrator/som-types.js +87 -0
  41. package/dist/orchestrator/som-types.js.map +1 -0
  42. package/dist/orchestrator/tool-registry.d.ts +2 -0
  43. package/dist/orchestrator/tool-registry.d.ts.map +1 -1
  44. package/dist/orchestrator/tool-registry.js.map +1 -1
  45. package/dist/orchestrator/tools/index.d.ts +5 -1
  46. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  47. package/dist/orchestrator/tools/index.js +9 -2
  48. package/dist/orchestrator/tools/index.js.map +1 -1
  49. package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
  50. package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
  51. package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
  52. package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
  53. package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
  54. package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
  55. package/dist/orchestrator/tools/verify-action-result.js +140 -0
  56. package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
  57. package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
  58. package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
  59. package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
  60. package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
  61. package/dist/orchestrator/types.d.ts +49 -1
  62. package/dist/orchestrator/types.d.ts.map +1 -1
  63. package/dist/orchestrator/types.js +11 -1
  64. package/dist/orchestrator/types.js.map +1 -1
  65. package/dist/prompts.d.ts.map +1 -1
  66. package/dist/prompts.js +40 -34
  67. package/dist/prompts.js.map +1 -1
  68. package/dist/scenario-service.d.ts +5 -0
  69. package/dist/scenario-service.d.ts.map +1 -1
  70. package/dist/scenario-service.js +17 -0
  71. package/dist/scenario-service.js.map +1 -1
  72. package/dist/scenario-worker-class.d.ts +4 -0
  73. package/dist/scenario-worker-class.d.ts.map +1 -1
  74. package/dist/scenario-worker-class.js +21 -3
  75. package/dist/scenario-worker-class.js.map +1 -1
  76. package/dist/testing/agent-tester.d.ts +35 -0
  77. package/dist/testing/agent-tester.d.ts.map +1 -0
  78. package/dist/testing/agent-tester.js +84 -0
  79. package/dist/testing/agent-tester.js.map +1 -0
  80. package/dist/testing/ref-translator-tester.d.ts +44 -0
  81. package/dist/testing/ref-translator-tester.d.ts.map +1 -0
  82. package/dist/testing/ref-translator-tester.js +104 -0
  83. package/dist/testing/ref-translator-tester.js.map +1 -0
  84. package/dist/utils/coordinate-converter.d.ts +32 -0
  85. package/dist/utils/coordinate-converter.d.ts.map +1 -0
  86. package/dist/utils/coordinate-converter.js +130 -0
  87. package/dist/utils/coordinate-converter.js.map +1 -0
  88. package/dist/utils/hierarchical-selector.d.ts +47 -0
  89. package/dist/utils/hierarchical-selector.d.ts.map +1 -0
  90. package/dist/utils/hierarchical-selector.js +212 -0
  91. package/dist/utils/hierarchical-selector.js.map +1 -0
  92. package/dist/utils/page-info-retry.d.ts +14 -0
  93. package/dist/utils/page-info-retry.d.ts.map +1 -0
  94. package/dist/utils/page-info-retry.js +60 -0
  95. package/dist/utils/page-info-retry.js.map +1 -0
  96. package/dist/utils/page-info-utils.d.ts +1 -0
  97. package/dist/utils/page-info-utils.d.ts.map +1 -1
  98. package/dist/utils/page-info-utils.js +46 -18
  99. package/dist/utils/page-info-utils.js.map +1 -1
  100. package/dist/utils/ref-attacher.d.ts +21 -0
  101. package/dist/utils/ref-attacher.d.ts.map +1 -0
  102. package/dist/utils/ref-attacher.js +149 -0
  103. package/dist/utils/ref-attacher.js.map +1 -0
  104. package/dist/utils/ref-translator.d.ts +49 -0
  105. package/dist/utils/ref-translator.d.ts.map +1 -0
  106. package/dist/utils/ref-translator.js +276 -0
  107. package/dist/utils/ref-translator.js.map +1 -0
  108. package/package.json +1 -1
  109. package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
  110. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
  111. package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
  112. package/plandocs/PHASE_1_COMPLETE.md +165 -0
  113. package/plandocs/PHASE_1_SUMMARY.md +184 -0
  114. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
  115. package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
  116. package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
  117. package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
  118. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
  119. package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
  120. package/plandocs/exploratory-mode-support-v2.plan.md +953 -0
  121. package/plandocs/exploratory-mode-support.plan.md +928 -0
  122. package/plandocs/journey-id-tracking-addendum.md +227 -0
  123. package/src/execution-service.ts +179 -596
  124. package/src/index.ts +10 -0
  125. package/src/llm-facade.ts +8 -8
  126. package/src/llm-provider.ts +11 -1
  127. package/src/model-constants.ts +17 -5
  128. package/src/orchestrator/decision-parser.ts +139 -0
  129. package/src/orchestrator/index.ts +27 -2
  130. package/src/orchestrator/orchestrator-agent.ts +868 -623
  131. package/src/orchestrator/orchestrator-prompts.ts +786 -0
  132. package/src/orchestrator/page-som-handler.ts +1565 -0
  133. package/src/orchestrator/som-types.ts +188 -0
  134. package/src/orchestrator/tool-registry.ts +2 -0
  135. package/src/orchestrator/tools/index.ts +5 -1
  136. package/src/orchestrator/tools/refresh-som-markers.ts +69 -0
  137. package/src/orchestrator/tools/verify-action-result.ts +159 -0
  138. package/src/orchestrator/tools/view-previous-screenshot.ts +103 -0
  139. package/src/orchestrator/types.ts +95 -4
  140. package/src/prompts.ts +40 -34
  141. package/src/scenario-service.ts +20 -0
  142. package/src/scenario-worker-class.ts +30 -4
  143. package/src/utils/coordinate-converter.ts +162 -0
  144. package/src/utils/page-info-retry.ts +65 -0
  145. package/src/utils/page-info-utils.ts +53 -18
  146. package/testchimp-runner-core-0.0.35.tgz +0 -0
  147. /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
  148. /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
  149. /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
  150. /package/{RELEASE_0.0.26.md → releasenotes/RELEASE_0.0.26.md} +0 -0
  151. /package/{RELEASE_0.0.27.md → releasenotes/RELEASE_0.0.27.md} +0 -0
  152. /package/{RELEASE_0.0.28.md → releasenotes/RELEASE_0.0.28.md} +0 -0
@@ -6,7 +6,8 @@
6
6
  import { expect } from '@playwright/test';
7
7
  import { LLMFacade } from '../llm-facade';
8
8
  import { ProgressReporter, TokenUsage, StepExecutionStatus } from '../progress-reporter';
9
- import { getEnhancedPageInfo } from '../utils/page-info-utils';
9
+ import { getEnhancedPageInfo, PageInfo } from '../utils/page-info-utils';
10
+ import { CoordinateConverter } from '../utils/coordinate-converter';
10
11
  import { ToolRegistry, ToolExecutionContext } from './tool-registry';
11
12
  import { DEFAULT_MODEL } from '../model-constants';
12
13
  import {
@@ -17,8 +18,16 @@ import {
17
18
  MemoryStep,
18
19
  OrchestratorStepResult,
19
20
  SelfReflection,
21
+ NoteToFutureSelf,
22
+ CoordinateAction,
23
+ ExplorationMode,
20
24
  DEFAULT_AGENT_CONFIG
21
25
  } from './types';
26
+ import { OrchestratorPrompts } from './orchestrator-prompts';
27
+ import { PageInfoRetry } from '../utils/page-info-retry';
28
+ import { DecisionParser } from './decision-parser';
29
+ import { PageSoMHandler } from './page-som-handler';
30
+ import { SomCommand, CommandRunStatus, InteractionAction, isSomVerification, isSomCommand, SomVerification } from './som-types';
22
31
 
23
32
  /**
24
33
  * Orchestrator Agent - manages step execution with tool use and memory
@@ -30,6 +39,9 @@ export class OrchestratorAgent {
30
39
  private config: Required<AgentConfig>;
31
40
  private logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void;
32
41
  private debugMode: boolean = false;
42
+ private decisionParser: DecisionParser;
43
+ private somHandler?: PageSoMHandler;
44
+ private previousSomScreenshot?: string; // Track previous iteration's screenshot
33
45
 
34
46
  constructor(
35
47
  llmFacade: LLMFacade,
@@ -45,6 +57,12 @@ export class OrchestratorAgent {
45
57
  this.progressReporter = progressReporter;
46
58
  this.logger = logger;
47
59
  this.debugMode = debugMode || false;
60
+ this.decisionParser = new DecisionParser(logger);
61
+
62
+ // Initialize SoM handler if enabled
63
+ if (this.config.useSoM) {
64
+ this.somHandler = new PageSoMHandler(null as any, this.logger);
65
+ }
48
66
  }
49
67
 
50
68
  setDebugMode(enabled: boolean): void {
@@ -61,15 +79,18 @@ export class OrchestratorAgent {
61
79
  totalSteps: number,
62
80
  scenarioSteps: string[],
63
81
  memory: JourneyMemory,
64
- jobId: string
82
+ jobId: string,
83
+ priorSteps?: string[], // NEW: For repair mode (undefined for script gen)
84
+ nextSteps?: string[] // NEW: For repair mode (undefined for script gen)
65
85
  ): Promise<OrchestratorStepResult> {
66
86
  this.logger?.(`\n[Orchestrator] ========== STEP ${stepNumber}/${totalSteps} ==========`);
67
87
  this.logger?.(`[Orchestrator] 🎯 Goal: ${stepDescription}`);
68
88
 
69
89
  let iteration = 0;
70
- let previousReflection: SelfReflection | undefined = undefined;
90
+ let noteToSelf: NoteToFutureSelf | undefined = memory.latestNote; // Start with note from previous step
71
91
  const commandsExecuted: string[] = [];
72
92
  let consecutiveFailures = 0; // Track consecutive iterations with failed commands
93
+ let coordinateAttempts = 0; // Track coordinate mode attempts (max 2)
73
94
 
74
95
  while (iteration < this.config.maxIterationsPerStep) {
75
96
  iteration++;
@@ -84,15 +105,23 @@ export class OrchestratorAgent {
84
105
  totalSteps,
85
106
  scenarioSteps,
86
107
  memory,
87
- previousReflection,
88
- consecutiveFailures
108
+ consecutiveFailures,
109
+ noteToSelf, // Pass note from previous iteration
110
+ priorSteps, // NEW: Pass repair context
111
+ nextSteps // NEW: Pass repair context
89
112
  );
90
113
 
91
114
  // Call agent to make decision
92
- const decision = await this.callAgent(context, jobId, stepNumber, iteration, consecutiveFailures);
115
+ const decision = await this.callAgent(
116
+ context,
117
+ jobId,
118
+ stepNumber,
119
+ iteration,
120
+ consecutiveFailures
121
+ );
93
122
 
94
123
  // Log agent's reasoning
95
- this.logAgentDecision(decision, iteration);
124
+ this.decisionParser.log(decision, iteration);
96
125
 
97
126
  // Report progress
98
127
  await this.reportStepProgress(jobId, stepNumber, stepDescription, decision, iteration);
@@ -100,8 +129,39 @@ export class OrchestratorAgent {
100
129
  // Execute tools if requested (tools are READ-ONLY, they don't change state)
101
130
  let toolResults: Record<string, any> = {};
102
131
 
132
+ // ANTI-LOOP: Detect and BLOCK screenshot loops (PER STEP)
133
+ const screenshotsThisStep = memory.history.filter(s =>
134
+ s.stepNumber === stepNumber &&
135
+ (s.code.includes('take_screenshot') || s.action.includes('Screenshot'))
136
+ );
137
+ const recentScreenshots = memory.history.slice(-3).filter(s =>
138
+ s.code.includes('take_screenshot') || s.action.includes('Screenshot')
139
+ );
140
+
141
+ if (screenshotsThisStep.length >= 3) {
142
+ this.logger?.(`[Orchestrator] 🚨 SCREENSHOT LOOP - ${screenshotsThisStep.length} screenshots THIS STEP! BLOCKING further screenshots`, 'error');
143
+ } else if (recentScreenshots.length >= 2 && iteration >= 3) {
144
+ this.logger?.(`[Orchestrator] ⚠️ WARNING: ${recentScreenshots.length} screenshots in last 3 iterations - agent may be looping`, 'warn');
145
+ }
146
+
103
147
  if (decision.toolCalls && decision.toolCalls.length > 0) {
104
- toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
148
+ // ENFORCE: Block screenshot tool calls if too many taken IN THIS STEP
149
+ if (screenshotsThisStep.length >= 3) {
150
+ decision.toolCalls = decision.toolCalls.filter(tc => tc.name !== 'take_screenshot');
151
+ if (decision.toolCalls.length === 0) {
152
+ this.logger?.(`[Orchestrator] 🚫 REJECTED screenshot tool call - loop detected. Agent must ACT.`, 'warn');
153
+ toolResults = [{
154
+ toolName: 'take_screenshot',
155
+ success: false,
156
+ error: 'SYSTEM BLOCKED: Too many screenshots taken. You must use existing DOM snapshots and execute commands now. Analysis paralysis detected.',
157
+ data: null
158
+ }];
159
+ }
160
+ }
161
+
162
+ if (decision.toolCalls.length > 0) {
163
+ toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber, context.currentPageInfo.refMap);
164
+ }
105
165
 
106
166
  // If agent wants to wait for tool results before proceeding, call agent again
107
167
  if (decision.needsToolResults) {
@@ -125,7 +185,7 @@ export class OrchestratorAgent {
125
185
  this.logger?.(`[Orchestrator] 🚧 BLOCKER DETECTED: ${decision.blockerDetected.description}`);
126
186
  this.logger?.(`[Orchestrator] 🧹 Clearing blocker with ${decision.blockerDetected.clearingCommands.length} command(s)...`);
127
187
 
128
- const blockerResult = await this.executeCommandsSequentially(
188
+ const blockerResult = await this.executeCommands(
129
189
  decision.blockerDetected.clearingCommands,
130
190
  page,
131
191
  memory,
@@ -148,9 +208,9 @@ export class OrchestratorAgent {
148
208
  }
149
209
  }
150
210
 
151
- // Execute main commands (only if no blocker failure or continueOnFailure is true)
152
- if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
153
- const executeResult = await this.executeCommandsSequentially(
211
+ // Execute main commands (only if no blocker failure)
212
+ if (!iterationHadFailure && decision.commands && decision.commands.length > 0) {
213
+ const executeResult = await this.executeCommands(
154
214
  decision.commands,
155
215
  page,
156
216
  memory,
@@ -171,8 +231,161 @@ export class OrchestratorAgent {
171
231
  }
172
232
  }
173
233
 
234
+ // Handle coordinate-based actions (NEW - fallback when selectors fail) - ONLY if enabled
235
+ if (this.config.enableCoordinateMode && decision.coordinateAction && !iterationHadFailure) {
236
+ coordinateAttempts++;
237
+
238
+ this.logger?.(`[Orchestrator] 🎯 Coordinate Action (attempt ${coordinateAttempts}/2): ${decision.coordinateAction.action} at (${decision.coordinateAction.xPercent}%, ${decision.coordinateAction.yPercent}%)`);
239
+
240
+ try {
241
+ // BEFORE screenshot for visual verification
242
+ const beforeScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
243
+ const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
244
+
245
+ // Generate Playwright commands from coordinate action
246
+ const coordCommands = await CoordinateConverter.generateCommands(decision.coordinateAction, page);
247
+
248
+ this.logger?.(`[Orchestrator] Generated commands:`);
249
+ coordCommands.forEach(cmd => this.logger?.(` ${cmd}`));
250
+
251
+ // Execute coordinate commands
252
+ const coordResult = await this.executeCommands(
253
+ coordCommands,
254
+ page,
255
+ memory,
256
+ stepNumber,
257
+ iteration,
258
+ jobId
259
+ );
260
+
261
+ commandsExecuted.push(...coordResult.executed);
262
+
263
+ if (!coordResult.allSucceeded) {
264
+ this.logger?.(`[Orchestrator] ❌ Coordinate action failed (Playwright error)`);
265
+ consecutiveFailures++;
266
+ iterationHadFailure = true;
267
+
268
+ // Give up after 2 coordinate attempts
269
+ if (coordinateAttempts >= 2) {
270
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
271
+ return {
272
+ success: false,
273
+ commands: commandsExecuted,
274
+ iterations: iteration,
275
+ terminationReason: 'agent_stuck',
276
+ memory,
277
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
278
+ };
279
+ }
280
+ } else {
281
+ this.logger?.(`[Orchestrator] ✅ Coordinate action succeeded (no Playwright error)`);
282
+
283
+ // CRITICAL: Verify visually if goal was achieved (coordinates might have clicked wrong place)
284
+ // Wait for network idle (smarter than fixed timeout - waits as needed, max 10s)
285
+ try {
286
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
287
+ } catch (e) {
288
+ // Network idle timeout - page may still be loading, but proceed with verification
289
+ this.logger?.(`[Orchestrator] ⚠️ Network idle timeout after 10s, proceeding with verification`, 'warn');
290
+ }
291
+
292
+ const afterScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
293
+ const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
294
+
295
+ this.logger?.(`[Orchestrator] 📸 Verifying coordinate action visually...`);
296
+
297
+ const verificationRequest = {
298
+ model: 'gpt-5-mini',
299
+ systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
300
+ userPrompt: `Goal: ${scenarioSteps[stepNumber - 1]}\n\nA coordinate-based action was just executed. Compare the BEFORE and AFTER screenshots.\n\nDid the action achieve the goal? Respond with JSON:\n{\n "verified": boolean,\n "reasoning": "What changed (or didn't change) between screenshots",\n "visibleChanges": ["List of UI changes observed"],\n "clickedWrongPlace": boolean\n}\n\nBe strict: Only return verified=true if you clearly see the expected change matching the goal.`,
301
+ images: [
302
+ { label: 'BEFORE', dataUrl: beforeDataUrl },
303
+ { label: 'AFTER', dataUrl: afterDataUrl }
304
+ ]
305
+ };
306
+
307
+ const verificationResponse = await this.llmFacade.llmProvider.callLLM(verificationRequest);
308
+ const jsonMatch = verificationResponse.answer.match(/\{[\s\S]*\}/);
309
+
310
+ if (jsonMatch) {
311
+ const verificationResult = JSON.parse(jsonMatch[0]);
312
+ const verified = verificationResult.verified === true;
313
+ const reasoning = verificationResult.reasoning || 'No reasoning provided';
314
+ const clickedWrongPlace = verificationResult.clickedWrongPlace === true;
315
+
316
+ this.logger?.(`[Orchestrator] 📊 Visual verification: ${verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
317
+ this.logger?.(`[Orchestrator] 💭 Reasoning: ${reasoning}`);
318
+
319
+ if (verified) {
320
+ // Goal achieved! Can mark complete
321
+ consecutiveFailures = 0;
322
+
323
+ // Store note for context
324
+ noteToSelf = {
325
+ fromIteration: iteration,
326
+ content: `Coordinate action verified successful: ${reasoning}`
327
+ };
328
+ memory.latestNote = noteToSelf;
329
+ } else {
330
+ // Goal NOT achieved despite no error
331
+ this.logger?.(`[Orchestrator] ⚠️ Coordinate click succeeded but goal NOT achieved`, 'warn');
332
+ consecutiveFailures++;
333
+ iterationHadFailure = true;
334
+
335
+ // Store diagnostic note for next attempt
336
+ const diagnostic = clickedWrongPlace
337
+ ? `Clicked wrong place. ${reasoning}. Try different coordinates.`
338
+ : `Action executed but goal not achieved. ${reasoning}. May need different approach.`;
339
+
340
+ noteToSelf = {
341
+ fromIteration: iteration,
342
+ content: diagnostic
343
+ };
344
+ memory.latestNote = noteToSelf;
345
+
346
+ // Give up after 2 coordinate attempts
347
+ if (coordinateAttempts >= 2) {
348
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts, none achieved goal) - marking stuck`);
349
+ return {
350
+ success: false,
351
+ commands: commandsExecuted,
352
+ iterations: iteration,
353
+ terminationReason: 'agent_stuck',
354
+ memory,
355
+ error: `Coordinate actions clicked but didn't achieve goal: ${reasoning}`
356
+ };
357
+ }
358
+ }
359
+ } else {
360
+ this.logger?.(`[Orchestrator] ⚠️ Could not parse verification response - treating as unverified`, 'warn');
361
+ consecutiveFailures++;
362
+ iterationHadFailure = true;
363
+ }
364
+ }
365
+
366
+ } catch (error: any) {
367
+ this.logger?.(`[Orchestrator] ❌ Coordinate action error: ${error.message}`, 'error');
368
+ consecutiveFailures++;
369
+ iterationHadFailure = true;
370
+
371
+ // Give up after 2 coordinate attempts
372
+ if (coordinateAttempts >= 2) {
373
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
374
+ return {
375
+ success: false,
376
+ commands: commandsExecuted,
377
+ iterations: iteration,
378
+ terminationReason: 'agent_stuck',
379
+ memory,
380
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
381
+ };
382
+ }
383
+ }
384
+ }
385
+
174
386
  // System-enforced stuck detection (agent might not detect it)
175
- if (consecutiveFailures >= 3) {
387
+ // Allow 5 failures: 3 selector attempts + 2 coordinate attempts
388
+ if (consecutiveFailures >= 5) {
176
389
  this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
177
390
  return {
178
391
  success: false,
@@ -205,23 +418,38 @@ export class OrchestratorAgent {
205
418
  }
206
419
  }
207
420
 
208
- // Store self-reflection for next iteration
209
- previousReflection = decision.selfReflection;
421
+ // Store note to future self (tactical memory across iterations AND steps)
422
+ if (decision.noteToFutureSelf) {
423
+ noteToSelf = {
424
+ fromIteration: iteration,
425
+ content: decision.noteToFutureSelf
426
+ };
427
+ memory.latestNote = noteToSelf; // Persist in journey memory across steps
428
+ this.logger?.(`[Orchestrator] 📝 Note to self: ${decision.noteToFutureSelf}`);
429
+ }
210
430
 
211
431
  // Check termination
212
432
  if (decision.status !== 'continue') {
213
433
  this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
214
434
  this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);
215
435
 
216
- return {
217
- success: decision.status === 'complete',
218
- commands: commandsExecuted,
219
- iterations: iteration,
220
- terminationReason: decision.status === 'complete' ? 'complete' :
221
- decision.status === 'stuck' ? 'agent_stuck' :
222
- 'infeasible',
223
- memory
224
- };
436
+ // SAFETY CHECK: Don't allow "complete" if commands failed this iteration
437
+ if (decision.status === 'complete' && iterationHadFailure) {
438
+ this.logger?.(`[Orchestrator] ⚠️ OVERRIDE: Agent said "complete" but commands FAILED - forcing "continue"`, 'warn');
439
+ this.logger?.(`[Orchestrator] 💭 System: Commands must succeed before marking complete`);
440
+ // Don't return - continue to next iteration
441
+ } else {
442
+ // Valid termination
443
+ return {
444
+ success: decision.status === 'complete',
445
+ commands: commandsExecuted,
446
+ iterations: iteration,
447
+ terminationReason: decision.status === 'complete' ? 'complete' :
448
+ decision.status === 'stuck' ? 'agent_stuck' :
449
+ 'infeasible',
450
+ memory
451
+ };
452
+ }
225
453
  }
226
454
  }
227
455
 
@@ -248,8 +476,10 @@ export class OrchestratorAgent {
248
476
  totalSteps: number,
249
477
  scenarioSteps: string[],
250
478
  memory: JourneyMemory,
251
- previousReflection?: SelfReflection,
252
- consecutiveFailures?: number
479
+ consecutiveFailures?: number,
480
+ noteFromPreviousIteration?: NoteToFutureSelf,
481
+ priorSteps?: string[], // NEW: For repair mode
482
+ nextSteps?: string[] // NEW: For repair mode
253
483
  ): Promise<AgentContext> {
254
484
  // Get fresh DOM
255
485
  const currentPageInfo = await getEnhancedPageInfo(page);
@@ -258,8 +488,41 @@ export class OrchestratorAgent {
258
488
  // Get recent steps
259
489
  const recentSteps = memory.history.slice(-this.config.recentStepsCount);
260
490
 
491
+ // SoM integration: Update markers and capture screenshot with visual IDs
492
+ let somScreenshot: string | undefined = undefined;
493
+ let somElementMap: string | undefined = undefined;
494
+ if (this.config.useSoM && this.somHandler) {
495
+ try {
496
+ if (!this.somHandler) {
497
+ this.somHandler = new PageSoMHandler(page, this.logger);
498
+ } else {
499
+ this.somHandler.setPage(page);
500
+ }
501
+
502
+ // Wait briefly for page stability (handles first iteration + safety net for fast SPAs)
503
+ try {
504
+ await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
505
+ } catch (error: any) {
506
+ // Page already loaded or timeout - continue
507
+ }
508
+
509
+ // Update SoM markers
510
+ await this.somHandler.updateSom();
511
+
512
+ // Get screenshot WITH markers (viewport only - agent can scroll or use take_screenshot for full page)
513
+ somScreenshot = await this.somHandler.getScreenshot(true, false, 60);
514
+
515
+ // Get element map for disambiguation
516
+ somElementMap = this.somHandler.getSomElementMap();
517
+
518
+ this.logger?.(`[Orchestrator] SoM screenshot captured for agent decision-making`, 'log');
519
+ } catch (error: any) {
520
+ this.logger?.(`[Orchestrator] Failed to capture SoM screenshot: ${error.message}`, 'error');
521
+ }
522
+ }
523
+
261
524
  // Build context
262
- return {
525
+ const context = {
263
526
  overallGoal: scenarioSteps.join('\n'),
264
527
  currentStepGoal,
265
528
  stepNumber,
@@ -271,8 +534,19 @@ export class OrchestratorAgent {
271
534
  recentSteps,
272
535
  experiences: memory.experiences,
273
536
  extractedData: memory.extractedData,
274
- previousIterationGuidance: previousReflection
537
+ noteFromPreviousIteration, // Pass tactical note from previous iteration
538
+ somScreenshot, // SoM screenshot with visual markers (current)
539
+ somElementMap, // SoM element details for disambiguation
540
+ priorSteps, // NEW: Repair context (undefined for script gen)
541
+ nextSteps // NEW: Repair context (undefined for script gen)
275
542
  };
543
+
544
+ // Save current screenshot as previous for next iteration (for tool access)
545
+ if (somScreenshot) {
546
+ this.previousSomScreenshot = somScreenshot;
547
+ }
548
+
549
+ return context;
276
550
  }
277
551
 
278
552
  /**
@@ -285,18 +559,50 @@ export class OrchestratorAgent {
285
559
  iteration: number,
286
560
  consecutiveFailures?: number
287
561
  ): Promise<AgentDecision> {
288
- // Build prompt
289
- const systemPrompt = this.buildSystemPrompt();
290
- const userPrompt = this.buildUserPrompt(context, consecutiveFailures);
562
+ // Detect if coordinate mode should be activated
563
+ // Phase 1: Only 2 tiers (selectors → coordinates), so activate after 3 failures
564
+ // Phase 2: Will have 3 tiers (selectors → index → coordinates), threshold will be 5
565
+ const useCoordinateMode = consecutiveFailures !== undefined && consecutiveFailures >= 3;
566
+
567
+ // Build appropriate system prompt based on mode
568
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
569
+ let systemPrompt: string;
570
+
571
+ if (this.config.useSoM) {
572
+ // SoM mode: Use visual element identification
573
+ systemPrompt = OrchestratorPrompts.buildSomSystemPrompt(this.config.somRestrictCoordinates);
574
+ } else if (useCoordinateMode) {
575
+ // Coordinate mode: Fallback when selectors fail
576
+ systemPrompt = OrchestratorPrompts.buildCoordinateSystemPrompt();
577
+ } else {
578
+ // Standard mode: DOM-based selectors
579
+ systemPrompt = OrchestratorPrompts.buildSystemPrompt(toolDescriptions, this.config.enableCoordinateMode);
580
+ }
581
+
582
+ const userPrompt = OrchestratorPrompts.buildUserPrompt(context, consecutiveFailures, this.config.enableCoordinateMode);
583
+
584
+ // Log prompt lengths for monitoring
585
+ const systemLength = systemPrompt.length;
586
+ const userLength = userPrompt.length;
587
+ const totalLength = systemLength + userLength;
588
+ const estimatedTokens = Math.ceil(totalLength / 4); // Rough estimate: 4 chars per token
589
+
590
+ this.logger?.(`[Orchestrator] 📊 Prompt lengths: system=${systemLength} chars, user=${userLength} chars, total=${totalLength} chars (~${estimatedTokens} tokens)`, 'log');
291
591
 
292
592
  try {
293
593
  // Call LLM directly via provider
294
- const llmRequest = {
594
+ const llmRequest: any = {
295
595
  model: DEFAULT_MODEL,
296
596
  systemPrompt,
297
597
  userPrompt
298
598
  };
299
599
 
600
+ // Include current SoM screenshot as image
601
+ if (context.somScreenshot) {
602
+ llmRequest.imageUrl = context.somScreenshot;
603
+ this.logger?.(`[Orchestrator] Including SoM screenshot in LLM request`, 'log');
604
+ }
605
+
300
606
  const response = await this.llmFacade.llmProvider.callLLM(llmRequest);
301
607
 
302
608
  // Report token usage
@@ -318,7 +624,7 @@ export class OrchestratorAgent {
318
624
  }
319
625
 
320
626
  // Parse response
321
- return this.parseAgentDecision(response.answer);
627
+ return this.decisionParser.parse(response.answer);
322
628
 
323
629
  } catch (error: any) {
324
630
  this.logger?.(`[Orchestrator] ✗ Agent call failed: ${error.message}`, 'error');
@@ -332,443 +638,6 @@ export class OrchestratorAgent {
332
638
  }
333
639
  }
334
640
 
335
- /**
336
- * Build system prompt with tool descriptions
337
- */
338
- private buildSystemPrompt(): string {
339
- const toolDescriptions = this.toolRegistry.generateToolDescriptions();
340
-
341
- return `You are an intelligent test automation agent that executes web scenarios using Playwright.
342
-
343
- ${toolDescriptions}
344
-
345
- YOUR RESPONSE FORMAT (JSON):
346
- {
347
- "toolCalls": [{"name": "tool_name", "params": {...}}], // Optional: Tools to use
348
- "toolReasoning": "Why I need these tools", // If using tools
349
- "needsToolResults": true/false, // Wait for tool results before commands?
350
- "commands": ["await page.click(...)", ...], // Playwright commands to execute
351
- "commandReasoning": "Why these commands", // If executing commands
352
- "selfReflection": { // Guidance for your next iteration
353
- "guidanceForNext": "What to focus on next",
354
- "detectingLoop": false, // Set true if repeating same approach
355
- "loopReasoning": "Why I think I'm looping" // If detecting loop
356
- },
357
- "experiences": ["App-specific learning"], // APP-SPECIFIC patterns (see rules below)
358
- "status": "continue|complete|stuck|infeasible", // Your decision
359
- "statusReasoning": "Why this status",
360
- "reasoning": "Overall reasoning for this iteration",
361
- "blockerDetected": { // OPTIONAL: If something blocks the current goal
362
- "description": "What's blocking", // E.g., "Cookie consent modal"
363
- "clearingCommands": ["await page.click(...)"] // Commands to clear blocker FIRST
364
- },
365
- "stepReEvaluation": { // OPTIONAL: If questioning step order (last resort)
366
- "detected": true, // Set true if you think step order is wrong
367
- "issue": "prior_incomplete|already_done|wrong_order", // What's the issue?
368
- "explanation": "Why step order seems wrong" // Your reasoning
369
- }
370
- }
371
-
372
- STATUS DECISION RULES (CRITICAL!):
373
-
374
- - "complete": ALL commands succeeded AND step goal achieved
375
- ❌ NEVER mark complete if ANY command failed!
376
-
377
- - "continue": Command failed OR need more actions
378
- Use when: Need to retry with different selector, wait for something, or continue execution
379
-
380
- - "stuck": Tried 3+ iterations, all failed
381
-
382
- - "infeasible": Goal truly impossible (element doesn't exist anywhere)
383
-
384
- IF COMMAND FAILED → status must be "continue" (to retry) or "stuck" (if tried many times)
385
- IF COMMAND SUCCEEDED → status can be "complete" (if goal done) or "continue" (if more needed)
386
-
387
- STEP RE-EVALUATION (LAST RESORT - After 2+ Failures):
388
-
389
- When you've failed multiple times on current step, QUESTION YOUR ASSUMPTIONS:
390
-
391
- 1. **LOOK BACK**: Did prior steps really complete?
392
- - Check COMPLETED steps list vs current page state
393
- - Example: Says "Login complete" but you see login form → PRIOR STEP FAILED
394
- - If prior step incomplete: Execute commands to complete it, explain in reasoning
395
-
396
- 2. **LOOK FORWARD**: Is current step already done or no longer relevant?
397
- - Check if current step goal already achieved (element exists, state correct)
398
- - Example: Step says "Click Submit" but form already submitted → STEP ALREADY DONE
399
- - If already done: Mark "complete" and explain why
400
-
401
- 3. **LOOK AHEAD**: Is next step more feasible?
402
- - Sometimes UI flows differently than expected
403
- - Example: Current step "Click Edit" fails, but next step "Fill Name" elements are visible
404
- - If can do future step: Note in reasoning, but PREFER sticking to order
405
-
406
- STICK TO ORIGINAL PLAN unless clear evidence suggests otherwise.
407
- Only re-evaluate after repeated failures on same step.
408
-
409
- BLOCKER DETECTION (IMPORTANT!):
410
-
411
- Use "blockerDetected" when the CURRENT STEP GOAL is not immediately feasible because something unexpected is in the way:
412
-
413
- Examples of BLOCKERS:
414
- - Cookie consent modal appears before login
415
- - "What's New" tour overlay prevents clicking navigation
416
- - Email verification prompt blocks form submission
417
- - Session timeout modal requires re-login before continuing
418
-
419
- When you detect a blocker:
420
- 1. Set "blockerDetected.description" to describe what's blocking
421
- 2. Provide "clearingCommands" to dismiss/handle the blocker
422
- 3. Set your main "commands" to execute AFTER blocker is cleared
423
- 4. Agent will execute: clearingCommands FIRST, then your commands
424
-
425
- Example:
426
- {
427
- "blockerDetected": {
428
- "description": "Cookie consent modal blocking page interaction",
429
- "clearingCommands": ["await page.getByRole('button', {name: 'Accept'}).click()"]
430
- },
431
- "commands": ["await page.getByRole('textbox', {name: 'Email'}).fill('user@example.com')"],
432
- ...
433
- }
434
-
435
- DON'T use blockerDetected for:
436
- - Normal step execution (just use commands)
437
- - Wrong selectors (that's not a blocker, just fix the selector)
438
- - Missing elements (that's infeasible, not blocked)
439
-
440
- EXPERIENCES - APP-SPECIFIC PATTERNS ONLY:
441
-
442
- Guidelines:
443
- - Concise - combine related info into single learning
444
- - Focus on app quirks, not obvious practices
445
- - Max 1-2 experiences per iteration
446
- - Capture SEMANTIC SELECTOR patterns when discovered
447
-
448
- ✅ GOOD - App-specific patterns:
449
- - "Login form fields accessible via getByLabel: 'Email' and 'Password'"
450
- - "Submit buttons consistently use role=button with text matching action"
451
- - "Form uses custom dropdowns (role=combobox) instead of native selects"
452
- - "Navigation items accessible via getByRole('link') with visible text"
453
- - "Site uses data-testid attributes consistently (pattern: action-element-name)"
454
- - "Input fields have clear placeholders - prefer getByPlaceholder over IDs"
455
-
456
- ❌ BAD - Too specific or obvious:
457
- - Listing individual field selectors (combine them)
458
- - "Button exists with role=button" (obvious)
459
- - Generic automation advice
460
- - Noting auto-generated IDs like #«r3»-form-item (these are unreliable)
461
-
462
- COMBINE related learnings instead of listing each element separately.
463
- PRIORITIZE learning about semantic selector availability over CSS selectors.
464
-
465
- CRITICAL RULES FOR DECISION MAKING:
466
-
467
- 1. **BE FOCUSED AND CONCISE** (CRITICAL!):
468
- - Read the CURRENT STEP GOAL carefully - ONLY do what's explicitly asked for in THAT step
469
- - DON'T add extra actions, verifications, or defensive checks not mentioned in the step goal
470
- - If step says "Send a message" → ONLY send the message, DON'T verify it appeared
471
- - If step says "Click Submit" → ONLY click, DON'T wait or verify submission success
472
- - ONLY add expect() assertions if the step explicitly mentions "verify", "check", "ensure", or "confirm"
473
- - Keep commands minimal - no unnecessary preparatory or validation actions
474
-
475
- 2. **COMMAND EFFICIENCY**:
476
- - DON'T click on input field before filling it - just use .fill() directly
477
- - DON'T do preparatory actions unless necessary
478
- - Example: page.fill works without clicking the input first
479
-
480
- 2b. **VARIABLE SCOPE & AVAILABLE FUNCTIONS** (CRITICAL!):
481
- - Commands in SAME BATCH share scope - variables persist within iteration
482
- - To save data for later steps: Use extract_data tool, then access via extractedData in context
483
- - Within same batch: Variables work across commands
484
- - AVAILABLE IN CONTEXT: page (Playwright Page object) and expect (Playwright assertions)
485
- - You can use expect() directly - it is already imported and available
486
-
487
- 3. **WHEN COMMAND FAILS** (most important!):
488
- - DON'T repeat the same command/selector
489
- - DO analyze the error message
490
- - DO look at DOM snapshot for SEMANTIC selectors (getByRole, getByLabel, getByText)
491
- - AVOID falling back to auto-generated IDs with unicode characters (e.g., #«r3»-form-item)
492
- - DO take screenshot if DOM doesn't help
493
- - "Element is not a <select>" → It's a custom dropdown, use .click() then select from menu
494
- - "Timeout waiting for locator" → Try semantic selectors from ARIA tree instead of CSS IDs
495
-
496
- 4. **CHECK YOUR RECENT STEPS FOR PATTERNS**:
497
- - If you see same selector failed 2+ times → YOU ARE LOOPING!
498
- - Set detectingLoop: true and explain why
499
- - Take screenshot to see what's actually on the page
500
- - Use different SEMANTIC selector from DOM/ARIA tree (getByRole, getByLabel, getByText, data-testid)
501
- - AVOID repeating with similar auto-generated ID patterns (e.g., #«r3»-form-item → #«r4»-form-item)
502
-
503
- 5. **USE DOM SNAPSHOT** - ONLY USE WHAT'S ACTUALLY THERE (CRITICAL!):
504
- - You get INTERACTIVE ELEMENTS list showing: position, tag, id, text, SUGGESTED SELECTORS
505
- - ONLY use selectors from this list - DO NOT invent selectors with text/names not in the list!
506
- - ❌ WRONG: List shows button "Submit" but you use getByRole('button', {name: 'SomeOtherText'})
507
- - ✅ RIGHT: Use EXACT selector from the provided list
508
- - ✅ RIGHT: If list shows "#submit-btn" → use that
509
- - ✅ RIGHT: If list shows "getByRole('button', {name: 'X'})" → use exactly that
510
- - If element you need is NOT in list → take_screenshot to find it
511
- - ARIA tree shows roles/names that exist - inventing roles/names not in tree will fail!
512
-
513
- 5b. **SELECTOR PREFERENCE** (CRITICAL - Playwright Best Practices):
514
- ALWAYS prefer user-friendly, semantic selectors in this order:
515
-
516
- 🏆 BEST (use these first):
517
- 1. page.getByRole('role', {name: 'text'}) → ✅ Accessible, semantic, resilient
518
- 2. page.getByLabel('label text') → ✅ Great for form inputs
519
- 3. page.getByPlaceholder('placeholder') → ✅ Good for inputs without labels
520
- 4. page.getByText('visible text') → ✅ Clear and readable
521
- 5. page.getByTestId('test-id') → ✅ Stable if available
522
-
523
- ⚠️ AVOID (last resort only):
524
- - CSS selectors with auto-generated IDs: #r3-form-item, #«r3»-form-item
525
- - CSS selectors with unicode: #«r4»-form-item
526
- - Complex CSS paths: div.ant-form > div:nth-child(2)
527
-
528
- Examples:
529
- ❌ BAD: await page.fill('#«r3»-form-item', 'alice@example.com')
530
- ✅ GOOD: await page.getByLabel('Email').fill('alice@example.com')
531
- ✅ GOOD: await page.getByRole('textbox', {name: 'Email'}).fill('alice@example.com')
532
- ✅ GOOD: await page.getByPlaceholder('Enter your email').fill('alice@example.com')
533
-
534
- ❌ BAD: await page.click('#submit-btn-23847')
535
- ✅ GOOD: await page.getByRole('button', {name: 'Submit'}).click()
536
- ✅ GOOD: await page.getByText('Submit').click()
537
-
538
- WHY: Semantic selectors are more maintainable, self-documenting, and resilient to UI changes.
539
- Auto-generated IDs (especially with unicode like «r3») break when component instances change.
540
-
541
- 5c. **ASSERTIONS AND VERIFICATION** (CRITICAL - Use expect):
542
- Use expect() for verification ONLY when the step explicitly asks for verification.
543
-
544
- 🏆 WHEN TO USE expect():
545
- - Step says "Verify message appears" → await expect(locator).toBeVisible()
546
- - Step says "Check field is empty" → await expect(locator).toBeEmpty()
547
- - Step says "Confirm URL changed" → await expect(page).toHaveURL('...')
548
- - Step explicitly mentions: "verify", "check", "ensure", "confirm", "validate"
549
-
550
- ❌ WHEN NOT TO USE expect():
551
- - Step says "Send a message" → DON'T add expect() for message visibility
552
- - Step says "Click Submit" → DON'T add expect() for button state or result
553
- - Step says "Fill Email field" → DON'T add expect() to verify it's filled
554
- - Step says "Navigate to Dashboard" → DON'T add expect() for URL (unless step asks)
555
-
556
- 🏆 expect() syntax (when needed):
557
- - await expect(locator).toBeVisible()
558
- - await expect(locator).toHaveText('expected text')
559
- - await expect(locator).toHaveValue('expected value')
560
- - await expect(locator).toBeEmpty()
561
- - await expect(page).toHaveURL('expected-url')
562
-
563
- ❌ BAD - Manual checks:
564
- const value = await page.getByPlaceholder('Message...').inputValue();
565
- if (value !== '') throw new Error(\`Expected empty but found: '\${value}'\`);
566
-
567
- ✅ GOOD - Use expect (but ONLY if step asks to verify):
568
- await expect(page.getByPlaceholder('Message...')).toBeEmpty();
569
-
570
- WHY: expect() has built-in retries and better error messages.
571
- BUT: Only use it when verification is explicitly requested in the step goal.
572
-
573
- 6. **TOOLS ARE READ-ONLY (Information Gathering)**:
574
- AVAILABLE TOOLS (for information only):
575
- - take_screenshot: Get visual context when DOM isn't enough or stuck 2+ times
576
- * DEFAULT: Captures FULL PAGE (elements below fold included)
577
- * Set isFullPage=false ONLY if you need just viewport
578
- * Use for: Planning, finding hidden elements, verifying layout
579
- - recall_history: Remember earlier steps in journey
580
- - inspect_page: Force fresh DOM snapshot (rarely needed, already in context)
581
- - check_page_ready: Verify page loaded (use sparingly!)
582
- - extract_data: Save data from page to memory for later steps
583
-
584
- ALL STATE CHANGES USE PLAYWRIGHT COMMANDS:
585
- - Navigation: await page.goto('url')
586
- - Go back: await page.goBack()
587
- - Refresh: await page.reload()
588
- - Fill inputs: await page.fill() or page.getByRole().fill()
589
- - Click buttons: await page.click() or page.getByRole().click()
590
- - Wait: await page.waitForURL(), page.waitForLoadState()
591
-
592
- Tools gather info → Commands change state
593
-
594
- 7. **ERROR-SPECIFIC RESPONSES**:
595
- - "Timeout waiting for locator" → Selector doesn't exist, try different one from DOM
596
- - "Element is not a <select>" → It's a custom dropdown (button with role=combobox), use .click()
597
- - "Multiple elements" → Add more specific selector from DOM (use #id)
598
-
599
- 8. **WHEN TO RUN COMMANDS vs TOOLS**:
600
- - Confident about selectors from DOM → Run commands directly
601
- - Unsure or failed 2+ times → Take screenshot first
602
- - First iteration of a step → Usually can run commands from DOM`;
603
- }
604
-
605
- /**
606
- * Build user prompt with context
607
- */
608
- private buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
609
- const parts: string[] = [];
610
-
611
- parts.push('=== YOUR CURRENT CONTEXT ===\n');
612
-
613
- // System warnings for accumulated failures
614
- if (consecutiveFailures && consecutiveFailures >= 2) {
615
- parts.push(`⚠️⚠️⚠️ SYSTEM WARNING ⚠️⚠️⚠️`);
616
- parts.push(`You've failed ${consecutiveFailures} iterations in a row!`);
617
- parts.push(`Next failure will force STUCK. You MUST:`);
618
- parts.push(`1. Take screenshot if you haven't already`);
619
- parts.push(`2. Try completely different selector strategy`);
620
- parts.push(`3. QUESTION ASSUMPTIONS: Am I at the right step?`);
621
- parts.push(` - LOOK BACK: Did prior steps really complete? Check page state vs COMPLETED list`);
622
- parts.push(` - LOOK FORWARD: Is current step already done? Check if goal already achieved`);
623
- parts.push(`4. OR mark status="stuck" if truly can't proceed`);
624
- parts.push(`⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️\n`);
625
- }
626
-
627
- // Goals - make current step very prominent
628
- parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
629
- parts.push(`${context.currentStepGoal}`);
630
- parts.push(`^^ DO ONLY WHAT THIS STEP ASKS - NO EXTRA ACTIONS OR VERIFICATIONS ^^`);
631
- parts.push(`^^ If step doesn't say "verify/check/confirm" → DON'T add expect() assertions ^^`);
632
- parts.push(`^^ Mark "complete" ONLY if commands succeeded ^^`);
633
- parts.push(``);
634
- parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
635
-
636
- if (context.completedSteps.length > 0) {
637
- parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
638
- }
639
- if (context.remainingSteps.length > 0) {
640
- parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
641
- }
642
-
643
- // Current page state
644
- parts.push(`\nCURRENT PAGE:`);
645
- parts.push(`URL: ${context.currentURL}`);
646
- parts.push(`Title: ${context.currentPageInfo.title}`);
647
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
648
- parts.push(context.currentPageInfo.formattedElements);
649
- parts.push(`\nARIA TREE (hierarchical structure):`);
650
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
651
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
652
- parts.push('... (truncated)');
653
- }
654
- parts.push('')
655
-
656
- // Recent steps
657
- if (context.recentSteps.length > 0) {
658
- parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
659
- for (const step of context.recentSteps) {
660
- const status = step.result === 'success' ? '✓' : '✗';
661
- parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
662
- parts.push(` Code: ${step.code}`);
663
- if (step.result === 'failure' && step.error) {
664
- parts.push(` ❌ ERROR: ${step.error}`);
665
- parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
666
- } else {
667
- parts.push(` Result: ${step.observation}`);
668
- }
669
- }
670
- parts.push('');
671
-
672
- // Detect repeated failures
673
- const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
674
- if (recentFailures.length >= 2) {
675
- const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
676
- i === 0 || s.code === arr[i-1].code
677
- );
678
- if (sameSelector) {
679
- parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
680
- parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
681
- parts.push(` YOU MUST try a completely different selector this time!\n`);
682
- }
683
- }
684
- }
685
-
686
- // Experiences
687
- if (context.experiences.length > 0) {
688
- parts.push(`\nLEARNINGS (${context.experiences.length}):`);
689
- for (const exp of context.experiences) {
690
- parts.push(` - ${exp}`);
691
- }
692
- parts.push('');
693
- }
694
-
695
- // Extracted data
696
- if (Object.keys(context.extractedData).length > 0) {
697
- parts.push(`\nEXTRACTED DATA:`);
698
- for (const [key, value] of Object.entries(context.extractedData)) {
699
- parts.push(` ${key}: ${value}`);
700
- }
701
- parts.push('');
702
- }
703
-
704
- // Previous reflection
705
- if (context.previousIterationGuidance) {
706
- parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
707
- parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
708
- if (context.previousIterationGuidance.detectingLoop) {
709
- parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
710
- }
711
- parts.push('');
712
- }
713
-
714
- // Tool results (if any)
715
- if (context.toolResults) {
716
- parts.push(`\n========== TOOL RESULTS ==========`);
717
- for (const [toolName, result] of Object.entries(context.toolResults)) {
718
- parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
719
-
720
- if (result.learning) {
721
- parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
722
- parts.push(result.learning);
723
- parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
724
- }
725
-
726
- if (result.error) {
727
- parts.push(` Error: ${result.error}`);
728
- }
729
- }
730
- parts.push(`\n===================================\n`);
731
- }
732
-
733
- parts.push('=== WHAT DO YOU WANT TO DO? ===');
734
- parts.push('Respond with JSON following the format specified in the system prompt.');
735
-
736
- return parts.join('\n');
737
- }
738
-
739
- /**
740
- * Parse agent decision from LLM response
741
- */
742
- private parseAgentDecision(response: string): AgentDecision {
743
- try {
744
- // Extract JSON from response
745
- const jsonMatch = response.match(/\{[\s\S]*\}/);
746
- if (!jsonMatch) {
747
- throw new Error('No JSON found in response');
748
- }
749
-
750
- const parsed = JSON.parse(jsonMatch[0]);
751
-
752
- // Validate required fields
753
- if (!parsed.status || !parsed.reasoning) {
754
- throw new Error('Missing required fields: status and reasoning');
755
- }
756
-
757
- return parsed as AgentDecision;
758
-
759
- } catch (error: any) {
760
- this.logger?.(`[Orchestrator] ✗ Failed to parse agent decision: ${error.message}`, 'error');
761
- this.logger?.(`[Orchestrator] Raw response: ${response.substring(0, 200)}...`, 'log');
762
-
763
- // Return fallback
764
- return {
765
- status: 'stuck',
766
- statusReasoning: 'Failed to parse agent response',
767
- reasoning: `Parse error: ${error.message}`
768
- };
769
- }
770
- }
771
-
772
641
  /**
773
642
  * Execute tools
774
643
  */
@@ -776,16 +645,20 @@ CRITICAL RULES FOR DECISION MAKING:
776
645
  toolCalls: any[],
777
646
  page: any,
778
647
  memory: JourneyMemory,
779
- stepNumber: number
648
+ stepNumber: number,
649
+ refMap?: Map<string, any>
780
650
  ): Promise<Record<string, any>> {
781
651
  this.logger?.(`[Orchestrator] 🔧 Executing ${toolCalls.length} tool(s)`);
782
652
 
783
653
  const results: Record<string, any> = {};
784
- const toolContext: ToolExecutionContext = {
654
+ const toolContext: ToolExecutionContext & { refMap?: Map<string, any>; previousSomScreenshot?: string; somHandler?: any } = {
785
655
  page,
786
656
  memory,
787
657
  stepNumber,
788
- logger: this.logger
658
+ logger: this.logger,
659
+ refMap, // Pass refMap for interact_with_ref tool
660
+ previousSomScreenshot: this.previousSomScreenshot, // For view_previous_screenshot tool
661
+ somHandler: this.somHandler // For refresh_som_markers tool
789
662
  };
790
663
 
791
664
  for (const toolCall of toolCalls.slice(0, this.config.maxToolCallsPerIteration)) {
@@ -805,71 +678,259 @@ CRITICAL RULES FOR DECISION MAKING:
805
678
  }
806
679
 
807
680
  /**
808
- * Execute commands sequentially with SHARED context (variables persist across commands)
681
+ * Parse SomCommand from command object
809
682
  */
810
- private async executeCommandsSequentially(
811
- commands: string[],
683
private parseSomCommand(cmd: any): SomCommand | null {
  // Normalizes a raw command value into a SomCommand.
  // Returns null when `cmd` is not an object carrying an `action`, or when it has
  // no usable target (no elementRef, no coord, and the action is not a navigation action).

  // `typeof null === 'object'`, so null has to be rejected explicitly; otherwise
  // reading `cmd.action` would throw a TypeError instead of yielding null.
  if (cmd === null || typeof cmd !== 'object' || !cmd.action) {
    return null;
  }

  // Navigation actions are accepted without an element reference or coordinate.
  const isNavigationAction = ['navigate', 'goBack', 'goForward', 'reload'].includes(cmd.action);
  const hasValidTarget = cmd.elementRef || cmd.coord || isNavigationAction;
  if (!hasValidTarget) {
    return null;
  }

  // Copy only the known command fields; any other properties on `cmd` are dropped.
  return {
    elementRef: cmd.elementRef,
    coord: cmd.coord,
    action: cmd.action,
    value: cmd.value,
    fromCoord: cmd.fromCoord,
    toCoord: cmd.toCoord,
    force: cmd.force,
    scrollAmount: cmd.scrollAmount,
    scrollDirection: cmd.scrollDirection,
    button: cmd.button,
    clickCount: cmd.clickCount,
    modifiers: cmd.modifiers,
    delay: cmd.delay,
    timeout: cmd.timeout
  };
}
710
+
711
+ /**
712
+ * Execute commands (mix of ref and playwright commands)
713
+ */
714
+ private async executeCommands(
715
+ commands: string[] | any[],
812
716
  page: any,
813
717
  memory: JourneyMemory,
814
718
  stepNumber: number,
815
719
  iteration: number,
816
720
  jobId: string
817
721
  ): Promise<{ executed: string[]; allSucceeded: boolean }> {
818
- this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s) in shared context`);
722
+ this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s)`);
819
723
 
820
724
  const executed: string[] = [];
821
- const limitedCommands = commands.slice(0, this.config.maxCommandsPerIteration);
822
725
 
823
- // Build execution with shared context (all commands share scope - variables persist)
824
- const commandsWithTracking = limitedCommands.map((cmd, i) => {
825
- return `
826
- // Command ${i + 1}/${limitedCommands.length}
726
+ if (commands.length === 0) {
727
+ return { executed: [], allSucceeded: true };
728
+ }
729
+
730
+ // SoM mode: Execute commands through PageSoMHandler
731
+ if (this.config.useSoM && this.somHandler) {
732
+ this.logger?.(`[Orchestrator] Using SoM mode for command execution`, 'log');
733
+
734
+ for (let i = 0; i < commands.length; i++) {
735
+ const cmd = commands[i];
736
+
737
+ // Check if verification or action command
738
+ if (isSomVerification(cmd)) {
739
+ // Handle verification command
740
+ try {
741
+ const result = await this.somHandler.executeVerification(cmd);
742
+
743
+ // Always add command to executed array (even if verification failed)
744
+ // Scripts should contain the expect even if it fails during generation
745
+ if (result.playwrightCommand) {
746
+ executed.push(result.playwrightCommand);
747
+ }
748
+
749
+ if (result.success) {
750
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] Verification passed`, 'log');
751
+
752
+ memory.history.push({
753
+ stepNumber,
754
+ iteration,
755
+ action: `Verification ${i + 1}/${commands.length}: ${cmd.verificationType}`,
756
+ code: result.playwrightCommand,
757
+ result: 'success',
758
+ observation: `Verified: ${cmd.description || cmd.expected}`,
759
+ url: page.url(),
760
+ timestamp: Date.now()
761
+ });
762
+ } else {
763
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] Verification failed (non-fatal): ${result.error}`, 'warn');
764
+
765
+ memory.history.push({
766
+ stepNumber,
767
+ iteration,
768
+ action: `Verification ${i + 1}/${commands.length} - FAILED`,
769
+ code: result.playwrightCommand || JSON.stringify(cmd),
770
+ result: 'failure',
771
+ observation: `Failed: ${result.error}`,
772
+ error: result.error,
773
+ url: page.url(),
774
+ timestamp: Date.now()
775
+ });
776
+
777
+ // Continue anyway - verification failures are non-blocking for script generation
778
+ }
779
+
780
+ // Small delay between commands
781
+ if (i < commands.length - 1) {
782
+ await page.waitForTimeout(300);
783
+ }
784
+
785
+ } catch (error: any) {
786
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] Verification exception: ${error.message}`, 'error');
787
+ }
788
+
789
+ } else if (isSomCommand(cmd)) {
790
+ // Handle action command (existing logic)
791
+ const somCommand = cmd as SomCommand;
792
+
793
+ try {
794
+ const result = await this.somHandler.runCommand(
795
+ somCommand,
796
+ this.config.somUseSomIdBasedCommands || false
797
+ );
798
+
799
+ if (result.status === CommandRunStatus.SUCCESS && result.successAttempt) {
800
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] SoM action succeeded`, 'log');
801
+ executed.push(result.successAttempt.command!);
802
+
803
+ memory.history.push({
804
+ stepNumber,
805
+ iteration,
806
+ action: `SoM Action ${i + 1}/${commands.length}: ${somCommand.action}`,
807
+ code: result.successAttempt.command!,
808
+ result: 'success',
809
+ observation: 'Executed successfully',
810
+ url: page.url(),
811
+ timestamp: Date.now()
812
+ });
813
+
814
+ // Small delay for form validation/animations
815
+ if (i < commands.length - 1) {
816
+ await page.waitForTimeout(300);
817
+ }
818
+ } else {
819
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] SoM action failed: ${result.error}`, 'error');
820
+
821
+ memory.history.push({
822
+ stepNumber,
823
+ iteration,
824
+ action: `SoM Action ${i + 1}/${commands.length}: ${somCommand.action} - FAILED`,
825
+ code: JSON.stringify(somCommand),
826
+ result: 'failure',
827
+ observation: `Failed: ${result.error}`,
828
+ error: result.error,
829
+ url: page.url(),
830
+ timestamp: Date.now()
831
+ });
832
+
833
+ // Refresh SoM after batch (DOM may have changed)
834
+ if (this.somHandler && page) {
835
+ this.somHandler.setPage(page);
836
+ await this.somHandler.updateSom();
837
+ }
838
+
839
+ return { executed, allSucceeded: false };
840
+ }
841
+ } catch (error: any) {
842
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] SoM action exception: ${error.message}`, 'error');
843
+
844
+ memory.history.push({
845
+ stepNumber,
846
+ iteration,
847
+ action: `SoM Action ${i + 1}/${commands.length} - EXCEPTION`,
848
+ code: JSON.stringify(somCommand),
849
+ result: 'failure',
850
+ observation: `Exception: ${error.message}`,
851
+ error: error.message,
852
+ url: page.url(),
853
+ timestamp: Date.now()
854
+ });
855
+
856
+ // Refresh SoM after batch (DOM may have changed)
857
+ if (this.somHandler && page) {
858
+ this.somHandler.setPage(page);
859
+ await this.somHandler.updateSom();
860
+ }
861
+
862
+ return { executed, allSucceeded: false };
863
+ }
864
+ } else {
865
+ this.logger?.(`[Orchestrator] ⚠ [${i + 1}/${commands.length}] Not a valid SoM command/verification, skipping`, 'warn');
866
+ }
867
+ }
868
+
869
+ // Always wait for page to stabilize after command batch
870
+ // This handles both explicit navigation AND clicks that trigger navigation/SPA routes
871
+ try {
872
+ this.logger?.(`[Orchestrator] Waiting for page to stabilize...`, 'log');
873
+ // Use networkidle with short timeout to catch navigation without blocking on SPAs with continuous requests
874
+ await page.waitForLoadState('networkidle', { timeout: 3000 });
875
+ this.logger?.(`[Orchestrator] Page stabilized (networkidle)`, 'log');
876
+ } catch (error: any) {
877
+ // If networkidle times out, fall back to domcontentloaded
878
+ try {
879
+ await page.waitForLoadState('domcontentloaded', { timeout: 2000 });
880
+ this.logger?.(`[Orchestrator] Page loaded (domcontentloaded)`, 'log');
881
+ } catch (error2: any) {
882
+ this.logger?.(`[Orchestrator] Page load wait timeout (continuing anyway)`, 'warn');
883
+ }
884
+ }
885
+
886
+ // Refresh SoM after batch (DOM may have changed and page is now stable)
887
+ if (this.somHandler && page) {
888
+ this.somHandler.setPage(page);
889
+ await this.somHandler.updateSom();
890
+ }
891
+
892
+ return { executed, allSucceeded: true };
893
+ }
894
+
895
+ // Standard mode: Execute all commands in sequence with small delay between them
896
+ // Delay helps with form validation, button enabling, and animations
897
+ const wrappedCode = (commands as string[]).map((cmd, i) => `
898
+ // Command ${i + 1}/${commands.length}
827
899
  try {
828
900
  ${cmd}
829
901
  __results.push({ index: ${i}, success: true });
902
+ ${i < commands.length - 1 ? 'await page.waitForTimeout(300);' : ''} // Small delay for form validation/animations
830
903
  } catch (error) {
831
904
  __results.push({ index: ${i}, success: false, error: error.message });
832
- throw error; // Stop on first failure
833
- }`;
834
- }).join('\n');
905
+ throw error;
906
+ }`).join('\n');
835
907
 
836
- const wrappedCode = `
837
- const __results = [];
838
- ${commandsWithTracking}
839
- return __results;
840
- `;
908
+ const fullCode = `const __results = []; ${wrappedCode} return __results;`;
841
909
 
842
910
  try {
843
- // Execute in shared context - variables declared here persist for entire scenario
844
- // Pass both page and expect to make Playwright assertions available
845
- const func = new Function('page', 'expect', 'return (async () => { ' + wrappedCode + ' })()');
846
- const results = await func(page, expect);
911
+ const func = new Function('page', 'expect', 'return (async () => { ' + fullCode + ' })()');
912
+ const results = await func(page, (global as any).expect);
847
913
 
848
- // Record results for each command
849
- for (let i = 0; i < limitedCommands.length; i++) {
850
- const cmd = limitedCommands[i];
914
+ for (let i = 0; i < commands.length; i++) {
915
+ const cmd = commands[i];
851
916
  const result = results[i];
852
917
 
853
918
  if (result && result.success) {
854
- this.logger?.(`[Orchestrator] ✓ [${i + 1}/${limitedCommands.length}] Success`);
855
-
856
- // Record in history
919
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] Success`);
857
920
  memory.history.push({
858
921
  stepNumber,
859
922
  iteration,
860
- action: `Command ${i + 1}/${limitedCommands.length}`,
923
+ action: `Command ${i + 1}/${commands.length}`,
861
924
  code: cmd,
862
925
  result: 'success',
863
926
  observation: 'Executed successfully',
864
927
  url: page.url(),
865
928
  timestamp: Date.now()
866
929
  });
867
-
868
930
  executed.push(cmd);
869
931
  }
870
932
  }
871
933
 
872
- // Cap history
873
934
  if (memory.history.length > this.config.maxHistorySize) {
874
935
  memory.history = memory.history.slice(-this.config.maxHistorySize);
875
936
  }
@@ -877,132 +938,25 @@ return __results;
877
938
  return { executed, allSucceeded: true };
878
939
 
879
940
  } catch (error: any) {
880
- // One of the commands failed - find which one
881
941
  const errorMessage = error.message || String(error);
942
+ this.logger?.(`[Orchestrator] ❌ Command execution failed: ${errorMessage}`, 'error');
882
943
 
883
- // Capture page state for debug logging
884
- let pageStateDebug = '';
885
- if (this.debugMode) {
886
- try {
887
- const pageInfo = await getEnhancedPageInfo(page);
888
- pageStateDebug = `
889
- === DEBUG: PAGE STATE AT FAILURE ===
890
- URL: ${page.url()}
891
- Title: ${pageInfo.title}
892
-
893
- INTERACTIVE ELEMENTS:
894
- ${pageInfo.formattedElements}
895
-
896
- ARIA SNAPSHOT:
897
- ${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}
898
-
899
- ====================================`;
900
- } catch (debugError: any) {
901
- pageStateDebug = `Failed to capture page state: ${debugError.message}`;
902
- }
903
- }
904
-
905
- // Record all that succeeded, then the failure
906
- for (let i = 0; i < limitedCommands.length; i++) {
907
- const cmd = limitedCommands[i];
908
-
909
- // This is a failed command (error happened here or earlier)
910
- if (executed.length <= i) {
911
- this.logger?.(`[Orchestrator] ✗ [${i + 1}/${limitedCommands.length}] Failed: ${errorMessage}`, 'error');
912
-
913
- // Log detailed debug info
914
- if (this.debugMode && pageStateDebug) {
915
- this.logger?.(pageStateDebug, 'debug');
916
- }
917
-
918
- memory.history.push({
919
- stepNumber,
920
- iteration,
921
- action: `Command ${i + 1}/${limitedCommands.length} - FAILED`,
922
- code: cmd,
923
- result: 'failure',
924
- observation: `Failed with error: ${errorMessage}. This selector likely doesn't exist or is incorrect.`,
925
- error: errorMessage,
926
- url: page.url(),
927
- timestamp: Date.now()
928
- });
929
-
930
- if (i < limitedCommands.length - 1) {
931
- this.logger?.(`[Orchestrator] ⚠ Skipping remaining ${limitedCommands.length - i - 1} command(s)`, 'warn');
932
- }
933
-
934
- break;
935
- }
936
- }
944
+ memory.history.push({
945
+ stepNumber,
946
+ iteration,
947
+ action: `Command - FAILED`,
948
+ code: commands[executed.length] || '',
949
+ result: 'failure',
950
+ observation: `Failed: ${errorMessage}`,
951
+ error: errorMessage,
952
+ url: page.url(),
953
+ timestamp: Date.now()
954
+ });
937
955
 
938
956
  return { executed, allSucceeded: false };
939
957
  }
940
958
  }
941
-
942
- /**
943
- * Execute a single command
944
- */
945
- private async executeCommand(cmd: string, page: any): Promise<void> {
946
- // Wrap in async function and execute
947
- const wrapped = `(async () => { ${cmd} })()`;
948
-
949
- try {
950
- await eval(wrapped);
951
- } catch (error: any) {
952
- // If eval fails, try direct execution with page context
953
- // Pass both page and expect to make Playwright assertions available
954
- const func = new Function('page', 'expect', `return (async () => { ${cmd} })()`);
955
- await func(page, expect);
956
- }
957
- }
958
-
959
- /**
960
- * Log agent decision
961
- */
962
- private logAgentDecision(decision: AgentDecision, iteration: number): void {
963
- this.logger?.(`[Orchestrator] 💭 REASONING: ${decision.reasoning}`);
964
-
965
- if (decision.selfReflection) {
966
- this.logger?.(`[Orchestrator] 🧠 SELF-REFLECTION:`);
967
- this.logger?.(`[Orchestrator] Next: ${decision.selfReflection.guidanceForNext}`);
968
- if (decision.selfReflection.detectingLoop) {
969
- this.logger?.(`[Orchestrator] 🔄 LOOP DETECTED: ${decision.selfReflection.loopReasoning}`, 'warn');
970
- }
971
- }
972
-
973
- if (decision.toolCalls && decision.toolCalls.length > 0) {
974
- this.logger?.(`[Orchestrator] 🔧 TOOLS: ${decision.toolCalls.map(t => t.name).join(', ')}`);
975
- if (decision.toolReasoning) {
976
- this.logger?.(`[Orchestrator] 📋 Why: ${decision.toolReasoning}`);
977
- }
978
- }
979
-
980
- if (decision.blockerDetected) {
981
- this.logger?.(`[Orchestrator] 🚧 BLOCKER: ${decision.blockerDetected.description}`, 'warn');
982
- this.logger?.(`[Orchestrator] 🧹 Clearing with ${decision.blockerDetected.clearingCommands.length} command(s)`);
983
- }
984
-
985
- if (decision.stepReEvaluation?.detected) {
986
- this.logger?.(`[Orchestrator] 🔍 STEP RE-EVALUATION: ${decision.stepReEvaluation.issue}`, 'warn');
987
- this.logger?.(`[Orchestrator] 📝 Explanation: ${decision.stepReEvaluation.explanation}`);
988
- }
989
-
990
- if (decision.commands && decision.commands.length > 0) {
991
- this.logger?.(`[Orchestrator] 📝 COMMANDS (${decision.commands.length}):`);
992
- decision.commands.slice(0, 3).forEach((cmd, i) => {
993
- this.logger?.(`[Orchestrator] ${i + 1}. ${cmd.substring(0, 80)}...`);
994
- });
995
- if (decision.commands.length > 3) {
996
- this.logger?.(`[Orchestrator] ... and ${decision.commands.length - 3} more`);
997
- }
998
- if (decision.commandReasoning) {
999
- this.logger?.(`[Orchestrator] 💡 Why: ${decision.commandReasoning}`);
1000
- }
1001
- }
1002
-
1003
- // Experiences will be logged when added to memory, no need to log here
1004
- }
1005
-
959
+
1006
960
  /**
1007
961
  * Report step progress
1008
962
  */
@@ -1032,6 +986,297 @@ ${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}
1032
986
  agentStatus: decision.status
1033
987
  });
1034
988
  }
989
+
990
+ /**
991
+ * Execute exploration mode - agent autonomously explores to achieve journey goal
992
+ * Fires onStepProgress callbacks for each autonomous action (transparent to caller)
993
+ */
994
+ async executeExploration(
995
+ page: any,
996
+ explorationConfig: ExplorationMode,
997
+ jobId: string
998
+ ): Promise<OrchestratorStepResult> {
999
+ this.logger?.(`\n[Orchestrator] ========== EXPLORATION MODE ==========`);
1000
+ this.logger?.(`[Orchestrator] 🎯 Journey Goal: ${explorationConfig.explorationPrompt}`);
1001
+ if (explorationConfig.testDataPrompt) {
1002
+ this.logger?.(`[Orchestrator] 📋 Test Data: ${explorationConfig.testDataPrompt}`);
1003
+ }
1004
+
1005
+ const memory: JourneyMemory = {
1006
+ history: [],
1007
+ experiences: [],
1008
+ extractedData: {}
1009
+ };
1010
+
1011
+ const maxSteps = explorationConfig.maxExplorationSteps || 50;
1012
+ let stepNumber = 0;
1013
+ const commandsExecuted: string[] = [];
1014
+
1015
+ while (stepNumber < maxSteps) {
1016
+ stepNumber++;
1017
+
1018
+ this.logger?.(`\n[Orchestrator] === Exploration Step ${stepNumber}/${maxSteps} ===`);
1019
+
1020
+ // Build exploratory context
1021
+ const context = await this.buildExploratoryContext(
1022
+ page,
1023
+ explorationConfig.explorationPrompt,
1024
+ explorationConfig.testDataPrompt,
1025
+ memory,
1026
+ stepNumber,
1027
+ maxSteps
1028
+ );
1029
+
1030
+ // Call agent with exploratory prompt
1031
+ const decision = await this.callExploratoryAgent(
1032
+ context,
1033
+ jobId,
1034
+ stepNumber
1035
+ );
1036
+
1037
+ this.decisionParser.log(decision, stepNumber);
1038
+
1039
+ // Report step start (fires JourneyRunner's beforeStepStart callback)
1040
+ if (this.progressReporter?.onStepProgress) {
1041
+ const stepInfo = {
1042
+ jobId,
1043
+ stepNumber,
1044
+ stepId: `exploration-${stepNumber}-${Date.now()}`,
1045
+ description: decision.reasoning,
1046
+ code: '', // Will be filled after commands execute
1047
+ status: StepExecutionStatus.IN_PROGRESS,
1048
+ wasRepaired: false
1049
+ };
1050
+ await this.progressReporter.onStepProgress(stepInfo);
1051
+ }
1052
+
1053
+ // Execute tools if requested
1054
+ if (decision.toolCalls && decision.toolCalls.length > 0) {
1055
+ const toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
1056
+
1057
+ // If needs tool results, call agent again
1058
+ if (decision.needsToolResults) {
1059
+ const updatedContext = { ...context, toolResults };
1060
+ const continuedDecision = await this.callExploratoryAgent(updatedContext, jobId, stepNumber);
1061
+
1062
+ decision.commands = continuedDecision.commands || decision.commands;
1063
+ decision.commandReasoning = continuedDecision.commandReasoning || decision.commandReasoning;
1064
+ decision.status = continuedDecision.status;
1065
+ }
1066
+ }
1067
+
1068
+ // Handle blocker clearing
1069
+ if (decision.blockerDetected && decision.blockerDetected.clearingCommands) {
1070
+ this.logger?.(`[Orchestrator] 🚧 Clearing blocker: ${decision.blockerDetected.description}`);
1071
+ const blockerResult = await this.executeCommands(
1072
+ decision.blockerDetected.clearingCommands,
1073
+ page,
1074
+ memory,
1075
+ stepNumber,
1076
+ 1,
1077
+ jobId
1078
+ );
1079
+ commandsExecuted.push(...blockerResult.executed);
1080
+ }
1081
+
1082
+ // Execute exploration commands
1083
+ let commandsSucceeded = true;
1084
+ if (decision.commands && decision.commands.length > 0) {
1085
+ const executeResult = await this.executeCommands(
1086
+ decision.commands,
1087
+ page,
1088
+ memory,
1089
+ stepNumber,
1090
+ 1,
1091
+ jobId
1092
+ );
1093
+ commandsExecuted.push(...executeResult.executed);
1094
+ commandsSucceeded = executeResult.allSucceeded;
1095
+ }
1096
+
1097
+ // Report step completion (fires JourneyRunner's onStepComplete callback)
1098
+ if (this.progressReporter?.onStepProgress) {
1099
+ const stepInfo = {
1100
+ jobId,
1101
+ stepNumber,
1102
+ stepId: `exploration-${stepNumber}-${Date.now()}`,
1103
+ description: decision.reasoning,
1104
+ code: decision.commands?.join('\n') || '',
1105
+ status: commandsSucceeded ? StepExecutionStatus.SUCCESS : StepExecutionStatus.FAILURE,
1106
+ error: commandsSucceeded ? undefined : 'Command execution failed',
1107
+ wasRepaired: false
1108
+ };
1109
+ await this.progressReporter.onStepProgress(stepInfo);
1110
+ }
1111
+
1112
+ // Add experiences (both app patterns AND exploration progress)
1113
+ if (decision.experiences) {
1114
+ memory.experiences.push(...decision.experiences);
1115
+ if (memory.experiences.length > this.config.maxExperiences) {
1116
+ memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
1117
+ }
1118
+ }
1119
+
1120
+ // Store note for next iteration
1121
+ if (decision.noteToFutureSelf) {
1122
+ memory.latestNote = {
1123
+ fromIteration: stepNumber,
1124
+ content: decision.noteToFutureSelf
1125
+ };
1126
+ }
1127
+
1128
+ // Check termination
1129
+ if (decision.status === 'complete') {
1130
+ this.logger?.(`[Orchestrator] ✅ Journey exploration complete: ${decision.statusReasoning}`);
1131
+ return {
1132
+ success: true,
1133
+ commands: commandsExecuted,
1134
+ iterations: stepNumber,
1135
+ terminationReason: 'complete',
1136
+ memory
1137
+ };
1138
+ } else if (decision.status === 'stuck') {
1139
+ this.logger?.(`[Orchestrator] ❌ Exploration stuck: ${decision.statusReasoning}`);
1140
+ return {
1141
+ success: false,
1142
+ commands: commandsExecuted,
1143
+ iterations: stepNumber,
1144
+ terminationReason: 'agent_stuck',
1145
+ memory,
1146
+ error: decision.statusReasoning
1147
+ };
1148
+ }
1149
+ }
1150
+
1151
+ // Hit max steps - not necessarily a failure
1152
+ this.logger?.(`[Orchestrator] ⚠ Maximum exploration steps reached (budget limit)`);
1153
+ return {
1154
+ success: true, // Not a failure - just budget limit
1155
+ commands: commandsExecuted,
1156
+ iterations: stepNumber,
1157
+ terminationReason: 'system_limit',
1158
+ memory
1159
+ };
1160
+ }
1161
+
1162
+ private async buildExploratoryContext(
1163
+ page: any,
1164
+ explorationPrompt: string,
1165
+ testDataPrompt: string | undefined,
1166
+ memory: JourneyMemory,
1167
+ stepNumber: number,
1168
+ maxSteps: number
1169
+ ): Promise<AgentContext> {
1170
+ // Wait for page to be ready and elements to appear (especially important after navigation)
1171
+ const currentPageInfo = await PageInfoRetry.getWithRetry(page);
1172
+ const currentURL = page.url();
1173
+ const recentSteps = memory.history.slice(-this.config.recentStepsCount);
1174
+
1175
+ // SoM integration for exploratory mode
1176
+ let somScreenshot: string | undefined = undefined;
1177
+ let somElementMap: string | undefined = undefined;
1178
+ if (this.config.useSoM && this.somHandler) {
1179
+ try {
1180
+ this.somHandler.setPage(page);
1181
+
1182
+ // Wait briefly for page stability (handles first iteration + safety net for fast SPAs)
1183
+ try {
1184
+ await page.waitForLoadState('domcontentloaded', { timeout: 2000 });
1185
+ } catch (error: any) {
1186
+ // Page already loaded or timeout - continue
1187
+ }
1188
+
1189
+ // Update SoM markers
1190
+ await this.somHandler.updateSom();
1191
+ somScreenshot = await this.somHandler.getScreenshot(true, false, 60); // Viewport only - agent can scroll or request full page
1192
+
1193
+ // Get element map for disambiguation
1194
+ somElementMap = this.somHandler.getSomElementMap();
1195
+
1196
+ this.logger?.(`[Orchestrator] SoM screenshot captured for exploratory agent`, 'log');
1197
+ } catch (error: any) {
1198
+ this.logger?.(`[Orchestrator] Failed to capture SoM screenshot: ${error.message}`, 'error');
1199
+ }
1200
+ }
1201
+
1202
+ const context = {
1203
+ overallGoal: explorationPrompt,
1204
+ currentStepGoal: explorationPrompt, // Same as overall for single journey
1205
+ stepNumber,
1206
+ totalSteps: maxSteps,
1207
+ completedSteps: [],
1208
+ remainingSteps: [],
1209
+ currentPageInfo,
1210
+ currentURL,
1211
+ recentSteps,
1212
+ experiences: memory.experiences,
1213
+ extractedData: memory.extractedData,
1214
+ noteFromPreviousIteration: memory.latestNote,
1215
+ testDataPrompt, // CRITICAL: Store testDataPrompt in context
1216
+ somScreenshot, // SoM screenshot for exploratory mode (current)
1217
+ somElementMap // SoM element details for disambiguation
1218
+ };
1219
+
1220
+ // Save current screenshot as previous for next iteration (for tool access)
1221
+ if (somScreenshot) {
1222
+ this.previousSomScreenshot = somScreenshot;
1223
+ }
1224
+
1225
+ return context;
1226
+ }
1227
+
1228
+ private async callExploratoryAgent(
1229
+ context: AgentContext,
1230
+ jobId: string,
1231
+ stepNumber: number
1232
+ ): Promise<AgentDecision> {
1233
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
1234
+
1235
+ // Use SoM system prompt if in SoM mode, otherwise use standard exploratory prompt
1236
+ const systemPrompt = this.config.useSoM
1237
+ ? OrchestratorPrompts.buildSomSystemPrompt(this.config.somRestrictCoordinates)
1238
+ : OrchestratorPrompts.buildExploratorySystemPrompt(toolDescriptions);
1239
+
1240
+ const userPrompt = OrchestratorPrompts.buildExploratoryUserPrompt(
1241
+ context,
1242
+ context.overallGoal,
1243
+ context.testDataPrompt, // Pass testDataPrompt from context
1244
+ stepNumber,
1245
+ context.totalSteps
1246
+ );
1247
+
1248
+ const llmRequest: any = {
1249
+ model: DEFAULT_MODEL,
1250
+ systemPrompt,
1251
+ userPrompt
1252
+ };
1253
+
1254
+ // Include current SoM screenshot as image
1255
+ if (context.somScreenshot) {
1256
+ llmRequest.imageUrl = context.somScreenshot;
1257
+ this.logger?.(`[Orchestrator] Including SoM screenshot in exploratory LLM request`, 'log');
1258
+ }
1259
+
1260
+ const response = await this.llmFacade.llmProvider.callLLM(llmRequest);
1261
+
1262
+ // Report token usage
1263
+ if (response.usage && this.progressReporter?.onTokensUsed) {
1264
+ await this.progressReporter.onTokensUsed({
1265
+ jobId,
1266
+ stepNumber,
1267
+ iteration: 1,
1268
+ inputTokens: response.usage.inputTokens,
1269
+ outputTokens: response.usage.outputTokens,
1270
+ includesImage: false,
1271
+ model: DEFAULT_MODEL,
1272
+ timestamp: Date.now()
1273
+ });
1274
+ }
1275
+
1276
+ // Parse response (same JSON format as regular mode)
1277
+ const decision = this.decisionParser.parse(response.answer);
1278
+ return decision;
1279
+ }
1035
1280
  }
1036
1281
 
1037
1282