testchimp-runner-core 0.0.33 → 0.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. package/dist/execution-service.d.ts +1 -4
  2. package/dist/execution-service.d.ts.map +1 -1
  3. package/dist/execution-service.js +155 -468
  4. package/dist/execution-service.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +11 -1
  8. package/dist/index.js.map +1 -1
  9. package/dist/llm-facade.d.ts.map +1 -1
  10. package/dist/llm-facade.js +7 -7
  11. package/dist/llm-facade.js.map +1 -1
  12. package/dist/llm-provider.d.ts +9 -0
  13. package/dist/llm-provider.d.ts.map +1 -1
  14. package/dist/model-constants.d.ts +16 -5
  15. package/dist/model-constants.d.ts.map +1 -1
  16. package/dist/model-constants.js +17 -6
  17. package/dist/model-constants.js.map +1 -1
  18. package/dist/orchestrator/decision-parser.d.ts +18 -0
  19. package/dist/orchestrator/decision-parser.d.ts.map +1 -0
  20. package/dist/orchestrator/decision-parser.js +127 -0
  21. package/dist/orchestrator/decision-parser.js.map +1 -0
  22. package/dist/orchestrator/index.d.ts +4 -2
  23. package/dist/orchestrator/index.d.ts.map +1 -1
  24. package/dist/orchestrator/index.js +15 -2
  25. package/dist/orchestrator/index.js.map +1 -1
  26. package/dist/orchestrator/orchestrator-agent.d.ts +17 -22
  27. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  28. package/dist/orchestrator/orchestrator-agent.js +708 -577
  29. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  30. package/dist/orchestrator/orchestrator-prompts.d.ts +32 -0
  31. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
  32. package/dist/orchestrator/orchestrator-prompts.js +737 -0
  33. package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
  34. package/dist/orchestrator/page-som-handler.d.ts +106 -0
  35. package/dist/orchestrator/page-som-handler.d.ts.map +1 -0
  36. package/dist/orchestrator/page-som-handler.js +1353 -0
  37. package/dist/orchestrator/page-som-handler.js.map +1 -0
  38. package/dist/orchestrator/som-types.d.ts +149 -0
  39. package/dist/orchestrator/som-types.d.ts.map +1 -0
  40. package/dist/orchestrator/som-types.js +87 -0
  41. package/dist/orchestrator/som-types.js.map +1 -0
  42. package/dist/orchestrator/tool-registry.d.ts +2 -0
  43. package/dist/orchestrator/tool-registry.d.ts.map +1 -1
  44. package/dist/orchestrator/tool-registry.js.map +1 -1
  45. package/dist/orchestrator/tools/index.d.ts +5 -1
  46. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  47. package/dist/orchestrator/tools/index.js +9 -2
  48. package/dist/orchestrator/tools/index.js.map +1 -1
  49. package/dist/orchestrator/tools/refresh-som-markers.d.ts +12 -0
  50. package/dist/orchestrator/tools/refresh-som-markers.d.ts.map +1 -0
  51. package/dist/orchestrator/tools/refresh-som-markers.js +64 -0
  52. package/dist/orchestrator/tools/refresh-som-markers.js.map +1 -0
  53. package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
  54. package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
  55. package/dist/orchestrator/tools/verify-action-result.js +140 -0
  56. package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
  57. package/dist/orchestrator/tools/view-previous-screenshot.d.ts +15 -0
  58. package/dist/orchestrator/tools/view-previous-screenshot.d.ts.map +1 -0
  59. package/dist/orchestrator/tools/view-previous-screenshot.js +92 -0
  60. package/dist/orchestrator/tools/view-previous-screenshot.js.map +1 -0
  61. package/dist/orchestrator/types.d.ts +49 -1
  62. package/dist/orchestrator/types.d.ts.map +1 -1
  63. package/dist/orchestrator/types.js +11 -1
  64. package/dist/orchestrator/types.js.map +1 -1
  65. package/dist/prompts.d.ts.map +1 -1
  66. package/dist/prompts.js +40 -34
  67. package/dist/prompts.js.map +1 -1
  68. package/dist/scenario-service.d.ts +5 -0
  69. package/dist/scenario-service.d.ts.map +1 -1
  70. package/dist/scenario-service.js +17 -0
  71. package/dist/scenario-service.js.map +1 -1
  72. package/dist/scenario-worker-class.d.ts +4 -0
  73. package/dist/scenario-worker-class.d.ts.map +1 -1
  74. package/dist/scenario-worker-class.js +21 -3
  75. package/dist/scenario-worker-class.js.map +1 -1
  76. package/dist/testing/agent-tester.d.ts +35 -0
  77. package/dist/testing/agent-tester.d.ts.map +1 -0
  78. package/dist/testing/agent-tester.js +84 -0
  79. package/dist/testing/agent-tester.js.map +1 -0
  80. package/dist/testing/ref-translator-tester.d.ts +44 -0
  81. package/dist/testing/ref-translator-tester.d.ts.map +1 -0
  82. package/dist/testing/ref-translator-tester.js +104 -0
  83. package/dist/testing/ref-translator-tester.js.map +1 -0
  84. package/dist/utils/coordinate-converter.d.ts +32 -0
  85. package/dist/utils/coordinate-converter.d.ts.map +1 -0
  86. package/dist/utils/coordinate-converter.js +130 -0
  87. package/dist/utils/coordinate-converter.js.map +1 -0
  88. package/dist/utils/hierarchical-selector.d.ts +47 -0
  89. package/dist/utils/hierarchical-selector.d.ts.map +1 -0
  90. package/dist/utils/hierarchical-selector.js +212 -0
  91. package/dist/utils/hierarchical-selector.js.map +1 -0
  92. package/dist/utils/page-info-retry.d.ts +14 -0
  93. package/dist/utils/page-info-retry.d.ts.map +1 -0
  94. package/dist/utils/page-info-retry.js +60 -0
  95. package/dist/utils/page-info-retry.js.map +1 -0
  96. package/dist/utils/page-info-utils.d.ts +1 -0
  97. package/dist/utils/page-info-utils.d.ts.map +1 -1
  98. package/dist/utils/page-info-utils.js +46 -18
  99. package/dist/utils/page-info-utils.js.map +1 -1
  100. package/dist/utils/ref-attacher.d.ts +21 -0
  101. package/dist/utils/ref-attacher.d.ts.map +1 -0
  102. package/dist/utils/ref-attacher.js +149 -0
  103. package/dist/utils/ref-attacher.js.map +1 -0
  104. package/dist/utils/ref-translator.d.ts +49 -0
  105. package/dist/utils/ref-translator.d.ts.map +1 -0
  106. package/dist/utils/ref-translator.js +276 -0
  107. package/dist/utils/ref-translator.js.map +1 -0
  108. package/package.json +1 -1
  109. package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
  110. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
  111. package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
  112. package/plandocs/PHASE_1_COMPLETE.md +165 -0
  113. package/plandocs/PHASE_1_SUMMARY.md +184 -0
  114. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
  115. package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
  116. package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
  117. package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
  118. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
  119. package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
  120. package/plandocs/exploratory-mode-support-v2.plan.md +953 -0
  121. package/plandocs/exploratory-mode-support.plan.md +928 -0
  122. package/plandocs/journey-id-tracking-addendum.md +227 -0
  123. package/src/execution-service.ts +179 -596
  124. package/src/index.ts +10 -0
  125. package/src/llm-facade.ts +8 -8
  126. package/src/llm-provider.ts +11 -1
  127. package/src/model-constants.ts +17 -5
  128. package/src/orchestrator/decision-parser.ts +139 -0
  129. package/src/orchestrator/index.ts +27 -2
  130. package/src/orchestrator/orchestrator-agent.ts +868 -623
  131. package/src/orchestrator/orchestrator-prompts.ts +786 -0
  132. package/src/orchestrator/page-som-handler.ts +1565 -0
  133. package/src/orchestrator/som-types.ts +188 -0
  134. package/src/orchestrator/tool-registry.ts +2 -0
  135. package/src/orchestrator/tools/index.ts +5 -1
  136. package/src/orchestrator/tools/refresh-som-markers.ts +69 -0
  137. package/src/orchestrator/tools/verify-action-result.ts +159 -0
  138. package/src/orchestrator/tools/view-previous-screenshot.ts +103 -0
  139. package/src/orchestrator/types.ts +95 -4
  140. package/src/prompts.ts +40 -34
  141. package/src/scenario-service.ts +20 -0
  142. package/src/scenario-worker-class.ts +30 -4
  143. package/src/utils/coordinate-converter.ts +162 -0
  144. package/src/utils/page-info-retry.ts +65 -0
  145. package/src/utils/page-info-utils.ts +53 -18
  146. package/testchimp-runner-core-0.0.35.tgz +0 -0
  147. /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
  148. /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
  149. /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
  150. /package/{RELEASE_0.0.26.md → releasenotes/RELEASE_0.0.26.md} +0 -0
  151. /package/{RELEASE_0.0.27.md → releasenotes/RELEASE_0.0.27.md} +0 -0
  152. /package/{RELEASE_0.0.28.md → releasenotes/RELEASE_0.0.28.md} +0 -0
@@ -5,11 +5,16 @@
5
5
  */
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
7
  exports.OrchestratorAgent = void 0;
8
- const test_1 = require("@playwright/test");
9
8
  const progress_reporter_1 = require("../progress-reporter");
10
9
  const page_info_utils_1 = require("../utils/page-info-utils");
10
+ const coordinate_converter_1 = require("../utils/coordinate-converter");
11
11
  const model_constants_1 = require("../model-constants");
12
12
  const types_1 = require("./types");
13
+ const orchestrator_prompts_1 = require("./orchestrator-prompts");
14
+ const page_info_retry_1 = require("../utils/page-info-retry");
15
+ const decision_parser_1 = require("./decision-parser");
16
+ const page_som_handler_1 = require("./page-som-handler");
17
+ const som_types_1 = require("./som-types");
13
18
  /**
14
19
  * Orchestrator Agent - manages step execution with tool use and memory
15
20
  */
@@ -22,6 +27,11 @@ class OrchestratorAgent {
22
27
  this.progressReporter = progressReporter;
23
28
  this.logger = logger;
24
29
  this.debugMode = debugMode || false;
30
+ this.decisionParser = new decision_parser_1.DecisionParser(logger);
31
+ // Initialize SoM handler if enabled
32
+ if (this.config.useSoM) {
33
+ this.somHandler = new page_som_handler_1.PageSoMHandler(null, this.logger);
34
+ }
25
35
  }
26
36
  setDebugMode(enabled) {
27
37
  this.debugMode = enabled;
@@ -29,28 +39,59 @@ class OrchestratorAgent {
29
39
  /**
30
40
  * Execute a single step of the scenario
31
41
  */
32
- async executeStep(page, stepDescription, stepNumber, totalSteps, scenarioSteps, memory, jobId) {
42
+ async executeStep(page, stepDescription, stepNumber, totalSteps, scenarioSteps, memory, jobId, priorSteps, // NEW: For repair mode (undefined for script gen)
43
+ nextSteps // NEW: For repair mode (undefined for script gen)
44
+ ) {
33
45
  this.logger?.(`\n[Orchestrator] ========== STEP ${stepNumber}/${totalSteps} ==========`);
34
46
  this.logger?.(`[Orchestrator] 🎯 Goal: ${stepDescription}`);
35
47
  let iteration = 0;
36
- let previousReflection = undefined;
48
+ let noteToSelf = memory.latestNote; // Start with note from previous step
37
49
  const commandsExecuted = [];
38
50
  let consecutiveFailures = 0; // Track consecutive iterations with failed commands
51
+ let coordinateAttempts = 0; // Track coordinate mode attempts (max 2)
39
52
  while (iteration < this.config.maxIterationsPerStep) {
40
53
  iteration++;
41
54
  this.logger?.(`\n[Orchestrator] === Iteration ${iteration}/${this.config.maxIterationsPerStep} ===`);
42
55
  // Build context for agent
43
- const context = await this.buildAgentContext(page, stepDescription, stepNumber, totalSteps, scenarioSteps, memory, previousReflection, consecutiveFailures);
56
+ const context = await this.buildAgentContext(page, stepDescription, stepNumber, totalSteps, scenarioSteps, memory, consecutiveFailures, noteToSelf, // Pass note from previous iteration
57
+ priorSteps, // NEW: Pass repair context
58
+ nextSteps // NEW: Pass repair context
59
+ );
44
60
  // Call agent to make decision
45
61
  const decision = await this.callAgent(context, jobId, stepNumber, iteration, consecutiveFailures);
46
62
  // Log agent's reasoning
47
- this.logAgentDecision(decision, iteration);
63
+ this.decisionParser.log(decision, iteration);
48
64
  // Report progress
49
65
  await this.reportStepProgress(jobId, stepNumber, stepDescription, decision, iteration);
50
66
  // Execute tools if requested (tools are READ-ONLY, they don't change state)
51
67
  let toolResults = {};
68
+ // ANTI-LOOP: Detect and BLOCK screenshot loops (PER STEP)
69
+ const screenshotsThisStep = memory.history.filter(s => s.stepNumber === stepNumber &&
70
+ (s.code.includes('take_screenshot') || s.action.includes('Screenshot')));
71
+ const recentScreenshots = memory.history.slice(-3).filter(s => s.code.includes('take_screenshot') || s.action.includes('Screenshot'));
72
+ if (screenshotsThisStep.length >= 3) {
73
+ this.logger?.(`[Orchestrator] 🚨 SCREENSHOT LOOP - ${screenshotsThisStep.length} screenshots THIS STEP! BLOCKING further screenshots`, 'error');
74
+ }
75
+ else if (recentScreenshots.length >= 2 && iteration >= 3) {
76
+ this.logger?.(`[Orchestrator] ⚠️ WARNING: ${recentScreenshots.length} screenshots in last 3 iterations - agent may be looping`, 'warn');
77
+ }
52
78
  if (decision.toolCalls && decision.toolCalls.length > 0) {
53
- toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
79
+ // ENFORCE: Block screenshot tool calls if too many taken IN THIS STEP
80
+ if (screenshotsThisStep.length >= 3) {
81
+ decision.toolCalls = decision.toolCalls.filter(tc => tc.name !== 'take_screenshot');
82
+ if (decision.toolCalls.length === 0) {
83
+ this.logger?.(`[Orchestrator] 🚫 REJECTED screenshot tool call - loop detected. Agent must ACT.`, 'warn');
84
+ toolResults = [{
85
+ toolName: 'take_screenshot',
86
+ success: false,
87
+ error: 'SYSTEM BLOCKED: Too many screenshots taken. You must use existing DOM snapshots and execute commands now. Analysis paralysis detected.',
88
+ data: null
89
+ }];
90
+ }
91
+ }
92
+ if (decision.toolCalls.length > 0) {
93
+ toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber, context.currentPageInfo.refMap);
94
+ }
54
95
  // If agent wants to wait for tool results before proceeding, call agent again
55
96
  if (decision.needsToolResults) {
56
97
  const updatedContext = { ...context, toolResults };
@@ -69,7 +110,7 @@ class OrchestratorAgent {
69
110
  if (decision.blockerDetected && decision.blockerDetected.clearingCommands && decision.blockerDetected.clearingCommands.length > 0) {
70
111
  this.logger?.(`[Orchestrator] 🚧 BLOCKER DETECTED: ${decision.blockerDetected.description}`);
71
112
  this.logger?.(`[Orchestrator] 🧹 Clearing blocker with ${decision.blockerDetected.clearingCommands.length} command(s)...`);
72
- const blockerResult = await this.executeCommandsSequentially(decision.blockerDetected.clearingCommands, page, memory, stepNumber, iteration, jobId);
113
+ const blockerResult = await this.executeCommands(decision.blockerDetected.clearingCommands, page, memory, stepNumber, iteration, jobId);
73
114
  // Add blocker commands with comment to output
74
115
  if (blockerResult.executed.length > 0) {
75
116
  commandsExecuted.push(`// Blocker: ${decision.blockerDetected.description}`);
@@ -82,9 +123,9 @@ class OrchestratorAgent {
82
123
  iterationHadFailure = true;
83
124
  }
84
125
  }
85
- // Execute main commands (only if no blocker failure or continueOnFailure is true)
86
- if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
87
- const executeResult = await this.executeCommandsSequentially(decision.commands, page, memory, stepNumber, iteration, jobId);
126
+ // Execute main commands (only if no blocker failure)
127
+ if (!iterationHadFailure && decision.commands && decision.commands.length > 0) {
128
+ const executeResult = await this.executeCommands(decision.commands, page, memory, stepNumber, iteration, jobId);
88
129
  commandsExecuted.push(...executeResult.executed);
89
130
  // Track failures
90
131
  if (!executeResult.allSucceeded) {
@@ -96,8 +137,136 @@ class OrchestratorAgent {
96
137
  consecutiveFailures = 0; // Reset on success
97
138
  }
98
139
  }
140
+ // Handle coordinate-based actions (NEW - fallback when selectors fail) - ONLY if enabled
141
+ if (this.config.enableCoordinateMode && decision.coordinateAction && !iterationHadFailure) {
142
+ coordinateAttempts++;
143
+ this.logger?.(`[Orchestrator] 🎯 Coordinate Action (attempt ${coordinateAttempts}/2): ${decision.coordinateAction.action} at (${decision.coordinateAction.xPercent}%, ${decision.coordinateAction.yPercent}%)`);
144
+ try {
145
+ // BEFORE screenshot for visual verification
146
+ const beforeScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
147
+ const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
148
+ // Generate Playwright commands from coordinate action
149
+ const coordCommands = await coordinate_converter_1.CoordinateConverter.generateCommands(decision.coordinateAction, page);
150
+ this.logger?.(`[Orchestrator] Generated commands:`);
151
+ coordCommands.forEach(cmd => this.logger?.(` ${cmd}`));
152
+ // Execute coordinate commands
153
+ const coordResult = await this.executeCommands(coordCommands, page, memory, stepNumber, iteration, jobId);
154
+ commandsExecuted.push(...coordResult.executed);
155
+ if (!coordResult.allSucceeded) {
156
+ this.logger?.(`[Orchestrator] ❌ Coordinate action failed (Playwright error)`);
157
+ consecutiveFailures++;
158
+ iterationHadFailure = true;
159
+ // Give up after 2 coordinate attempts
160
+ if (coordinateAttempts >= 2) {
161
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
162
+ return {
163
+ success: false,
164
+ commands: commandsExecuted,
165
+ iterations: iteration,
166
+ terminationReason: 'agent_stuck',
167
+ memory,
168
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
169
+ };
170
+ }
171
+ }
172
+ else {
173
+ this.logger?.(`[Orchestrator] ✅ Coordinate action succeeded (no Playwright error)`);
174
+ // CRITICAL: Verify visually if goal was achieved (coordinates might have clicked wrong place)
175
+ // Wait for network idle (smarter than fixed timeout - waits as needed, max 10s)
176
+ try {
177
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
178
+ }
179
+ catch (e) {
180
+ // Network idle timeout - page may still be loading, but proceed with verification
181
+ this.logger?.(`[Orchestrator] ⚠️ Network idle timeout after 10s, proceeding with verification`, 'warn');
182
+ }
183
+ const afterScreenshot = await page.screenshot({ encoding: 'base64', fullPage: false, type: 'jpeg', quality: 60 });
184
+ const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
185
+ this.logger?.(`[Orchestrator] 📸 Verifying coordinate action visually...`);
186
+ const verificationRequest = {
187
+ model: 'gpt-5-mini',
188
+ systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its goal.',
189
+ userPrompt: `Goal: ${scenarioSteps[stepNumber - 1]}\n\nA coordinate-based action was just executed. Compare the BEFORE and AFTER screenshots.\n\nDid the action achieve the goal? Respond with JSON:\n{\n "verified": boolean,\n "reasoning": "What changed (or didn't change) between screenshots",\n "visibleChanges": ["List of UI changes observed"],\n "clickedWrongPlace": boolean\n}\n\nBe strict: Only return verified=true if you clearly see the expected change matching the goal.`,
190
+ images: [
191
+ { label: 'BEFORE', dataUrl: beforeDataUrl },
192
+ { label: 'AFTER', dataUrl: afterDataUrl }
193
+ ]
194
+ };
195
+ const verificationResponse = await this.llmFacade.llmProvider.callLLM(verificationRequest);
196
+ const jsonMatch = verificationResponse.answer.match(/\{[\s\S]*\}/);
197
+ if (jsonMatch) {
198
+ const verificationResult = JSON.parse(jsonMatch[0]);
199
+ const verified = verificationResult.verified === true;
200
+ const reasoning = verificationResult.reasoning || 'No reasoning provided';
201
+ const clickedWrongPlace = verificationResult.clickedWrongPlace === true;
202
+ this.logger?.(`[Orchestrator] 📊 Visual verification: ${verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
203
+ this.logger?.(`[Orchestrator] 💭 Reasoning: ${reasoning}`);
204
+ if (verified) {
205
+ // Goal achieved! Can mark complete
206
+ consecutiveFailures = 0;
207
+ // Store note for context
208
+ noteToSelf = {
209
+ fromIteration: iteration,
210
+ content: `Coordinate action verified successful: ${reasoning}`
211
+ };
212
+ memory.latestNote = noteToSelf;
213
+ }
214
+ else {
215
+ // Goal NOT achieved despite no error
216
+ this.logger?.(`[Orchestrator] ⚠️ Coordinate click succeeded but goal NOT achieved`, 'warn');
217
+ consecutiveFailures++;
218
+ iterationHadFailure = true;
219
+ // Store diagnostic note for next attempt
220
+ const diagnostic = clickedWrongPlace
221
+ ? `Clicked wrong place. ${reasoning}. Try different coordinates.`
222
+ : `Action executed but goal not achieved. ${reasoning}. May need different approach.`;
223
+ noteToSelf = {
224
+ fromIteration: iteration,
225
+ content: diagnostic
226
+ };
227
+ memory.latestNote = noteToSelf;
228
+ // Give up after 2 coordinate attempts
229
+ if (coordinateAttempts >= 2) {
230
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts, none achieved goal) - marking stuck`);
231
+ return {
232
+ success: false,
233
+ commands: commandsExecuted,
234
+ iterations: iteration,
235
+ terminationReason: 'agent_stuck',
236
+ memory,
237
+ error: `Coordinate actions clicked but didn't achieve goal: ${reasoning}`
238
+ };
239
+ }
240
+ }
241
+ }
242
+ else {
243
+ this.logger?.(`[Orchestrator] ⚠️ Could not parse verification response - treating as unverified`, 'warn');
244
+ consecutiveFailures++;
245
+ iterationHadFailure = true;
246
+ }
247
+ }
248
+ }
249
+ catch (error) {
250
+ this.logger?.(`[Orchestrator] ❌ Coordinate action error: ${error.message}`, 'error');
251
+ consecutiveFailures++;
252
+ iterationHadFailure = true;
253
+ // Give up after 2 coordinate attempts
254
+ if (coordinateAttempts >= 2) {
255
+ this.logger?.(`[Orchestrator] 🛑 Coordinate mode exhausted (2 attempts) - marking stuck`);
256
+ return {
257
+ success: false,
258
+ commands: commandsExecuted,
259
+ iterations: iteration,
260
+ terminationReason: 'agent_stuck',
261
+ memory,
262
+ error: 'Coordinate fallback failed after 2 attempts - unable to proceed'
263
+ };
264
+ }
265
+ }
266
+ }
99
267
  // System-enforced stuck detection (agent might not detect it)
100
- if (consecutiveFailures >= 3) {
268
+ // Allow 5 failures: 3 selector attempts + 2 coordinate attempts
269
+ if (consecutiveFailures >= 5) {
101
270
  this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
102
271
  return {
103
272
  success: false,
@@ -124,21 +293,37 @@ class OrchestratorAgent {
124
293
  memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
125
294
  }
126
295
  }
127
- // Store self-reflection for next iteration
128
- previousReflection = decision.selfReflection;
296
+ // Store note to future self (tactical memory across iterations AND steps)
297
+ if (decision.noteToFutureSelf) {
298
+ noteToSelf = {
299
+ fromIteration: iteration,
300
+ content: decision.noteToFutureSelf
301
+ };
302
+ memory.latestNote = noteToSelf; // Persist in journey memory across steps
303
+ this.logger?.(`[Orchestrator] 📝 Note to self: ${decision.noteToFutureSelf}`);
304
+ }
129
305
  // Check termination
130
306
  if (decision.status !== 'continue') {
131
307
  this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
132
308
  this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);
133
- return {
134
- success: decision.status === 'complete',
135
- commands: commandsExecuted,
136
- iterations: iteration,
137
- terminationReason: decision.status === 'complete' ? 'complete' :
138
- decision.status === 'stuck' ? 'agent_stuck' :
139
- 'infeasible',
140
- memory
141
- };
309
+ // SAFETY CHECK: Don't allow "complete" if commands failed this iteration
310
+ if (decision.status === 'complete' && iterationHadFailure) {
311
+ this.logger?.(`[Orchestrator] ⚠️ OVERRIDE: Agent said "complete" but commands FAILED - forcing "continue"`, 'warn');
312
+ this.logger?.(`[Orchestrator] 💭 System: Commands must succeed before marking complete`);
313
+ // Don't return - continue to next iteration
314
+ }
315
+ else {
316
+ // Valid termination
317
+ return {
318
+ success: decision.status === 'complete',
319
+ commands: commandsExecuted,
320
+ iterations: iteration,
321
+ terminationReason: decision.status === 'complete' ? 'complete' :
322
+ decision.status === 'stuck' ? 'agent_stuck' :
323
+ 'infeasible',
324
+ memory
325
+ };
326
+ }
142
327
  }
143
328
  }
144
329
  // Hit iteration limit
@@ -155,14 +340,46 @@ class OrchestratorAgent {
155
340
  /**
156
341
  * Build context for agent
157
342
  */
158
- async buildAgentContext(page, currentStepGoal, stepNumber, totalSteps, scenarioSteps, memory, previousReflection, consecutiveFailures) {
343
+ async buildAgentContext(page, currentStepGoal, stepNumber, totalSteps, scenarioSteps, memory, consecutiveFailures, noteFromPreviousIteration, priorSteps, // NEW: For repair mode
344
+ nextSteps // NEW: For repair mode
345
+ ) {
159
346
  // Get fresh DOM
160
347
  const currentPageInfo = await (0, page_info_utils_1.getEnhancedPageInfo)(page);
161
348
  const currentURL = page.url();
162
349
  // Get recent steps
163
350
  const recentSteps = memory.history.slice(-this.config.recentStepsCount);
351
+ // SoM integration: Update markers and capture screenshot with visual IDs
352
+ let somScreenshot = undefined;
353
+ let somElementMap = undefined;
354
+ if (this.config.useSoM && this.somHandler) {
355
+ try {
356
+ if (!this.somHandler) {
357
+ this.somHandler = new page_som_handler_1.PageSoMHandler(page, this.logger);
358
+ }
359
+ else {
360
+ this.somHandler.setPage(page);
361
+ }
362
+ // Wait briefly for page stability (handles first iteration + safety net for fast SPAs)
363
+ try {
364
+ await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
365
+ }
366
+ catch (error) {
367
+ // Page already loaded or timeout - continue
368
+ }
369
+ // Update SoM markers
370
+ await this.somHandler.updateSom();
371
+ // Get screenshot WITH markers (viewport only - agent can scroll or use take_screenshot for full page)
372
+ somScreenshot = await this.somHandler.getScreenshot(true, false, 60);
373
+ // Get element map for disambiguation
374
+ somElementMap = this.somHandler.getSomElementMap();
375
+ this.logger?.(`[Orchestrator] SoM screenshot captured for agent decision-making`, 'log');
376
+ }
377
+ catch (error) {
378
+ this.logger?.(`[Orchestrator] Failed to capture SoM screenshot: ${error.message}`, 'error');
379
+ }
380
+ }
164
381
  // Build context
165
- return {
382
+ const context = {
166
383
  overallGoal: scenarioSteps.join('\n'),
167
384
  currentStepGoal,
168
385
  stepNumber,
@@ -174,16 +391,48 @@ class OrchestratorAgent {
174
391
  recentSteps,
175
392
  experiences: memory.experiences,
176
393
  extractedData: memory.extractedData,
177
- previousIterationGuidance: previousReflection
394
+ noteFromPreviousIteration, // Pass tactical note from previous iteration
395
+ somScreenshot, // SoM screenshot with visual markers (current)
396
+ somElementMap, // SoM element details for disambiguation
397
+ priorSteps, // NEW: Repair context (undefined for script gen)
398
+ nextSteps // NEW: Repair context (undefined for script gen)
178
399
  };
400
+ // Save current screenshot as previous for next iteration (for tool access)
401
+ if (somScreenshot) {
402
+ this.previousSomScreenshot = somScreenshot;
403
+ }
404
+ return context;
179
405
  }
180
406
  /**
181
407
  * Call agent to make decision
182
408
  */
183
409
  async callAgent(context, jobId, stepNumber, iteration, consecutiveFailures) {
184
- // Build prompt
185
- const systemPrompt = this.buildSystemPrompt();
186
- const userPrompt = this.buildUserPrompt(context, consecutiveFailures);
410
+ // Detect if coordinate mode should be activated
411
+ // Phase 1: Only 2 tiers (selectors → coordinates), so activate after 3 failures
412
+ // Phase 2: Will have 3 tiers (selectors → index → coordinates), threshold will be 5
413
+ const useCoordinateMode = consecutiveFailures !== undefined && consecutiveFailures >= 3;
414
+ // Build appropriate system prompt based on mode
415
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
416
+ let systemPrompt;
417
+ if (this.config.useSoM) {
418
+ // SoM mode: Use visual element identification
419
+ systemPrompt = orchestrator_prompts_1.OrchestratorPrompts.buildSomSystemPrompt(this.config.somRestrictCoordinates);
420
+ }
421
+ else if (useCoordinateMode) {
422
+ // Coordinate mode: Fallback when selectors fail
423
+ systemPrompt = orchestrator_prompts_1.OrchestratorPrompts.buildCoordinateSystemPrompt();
424
+ }
425
+ else {
426
+ // Standard mode: DOM-based selectors
427
+ systemPrompt = orchestrator_prompts_1.OrchestratorPrompts.buildSystemPrompt(toolDescriptions, this.config.enableCoordinateMode);
428
+ }
429
+ const userPrompt = orchestrator_prompts_1.OrchestratorPrompts.buildUserPrompt(context, consecutiveFailures, this.config.enableCoordinateMode);
430
+ // Log prompt lengths for monitoring
431
+ const systemLength = systemPrompt.length;
432
+ const userLength = userPrompt.length;
433
+ const totalLength = systemLength + userLength;
434
+ const estimatedTokens = Math.ceil(totalLength / 4); // Rough estimate: 4 chars per token
435
+ this.logger?.(`[Orchestrator] 📊 Prompt lengths: system=${systemLength} chars, user=${userLength} chars, total=${totalLength} chars (~${estimatedTokens} tokens)`, 'log');
187
436
  try {
188
437
  // Call LLM directly via provider
189
438
  const llmRequest = {
@@ -191,6 +440,11 @@ class OrchestratorAgent {
191
440
  systemPrompt,
192
441
  userPrompt
193
442
  };
443
+ // Include current SoM screenshot as image
444
+ if (context.somScreenshot) {
445
+ llmRequest.imageUrl = context.somScreenshot;
446
+ this.logger?.(`[Orchestrator] Including SoM screenshot in LLM request`, 'log');
447
+ }
194
448
  const response = await this.llmFacade.llmProvider.callLLM(llmRequest);
195
449
  // Report token usage
196
450
  if (response.usage && this.progressReporter?.onTokensUsed) {
@@ -211,7 +465,7 @@ class OrchestratorAgent {
211
465
  this.logger?.(`[Orchestrator] ⚠ No usage data in LLM response`, 'warn');
212
466
  }
213
467
  // Parse response
214
- return this.parseAgentDecision(response.answer);
468
+ return this.decisionParser.parse(response.answer);
215
469
  }
216
470
  catch (error) {
217
471
  this.logger?.(`[Orchestrator] ✗ Agent call failed: ${error.message}`, 'error');
@@ -223,430 +477,20 @@ class OrchestratorAgent {
223
477
  };
224
478
  }
225
479
  }
226
- /**
227
- * Build system prompt with tool descriptions
228
- */
229
- buildSystemPrompt() {
230
- const toolDescriptions = this.toolRegistry.generateToolDescriptions();
231
- return `You are an intelligent test automation agent that executes web scenarios using Playwright.
232
-
233
- ${toolDescriptions}
234
-
235
- YOUR RESPONSE FORMAT (JSON):
236
- {
237
- "toolCalls": [{"name": "tool_name", "params": {...}}], // Optional: Tools to use
238
- "toolReasoning": "Why I need these tools", // If using tools
239
- "needsToolResults": true/false, // Wait for tool results before commands?
240
- "commands": ["await page.click(...)", ...], // Playwright commands to execute
241
- "commandReasoning": "Why these commands", // If executing commands
242
- "selfReflection": { // Guidance for your next iteration
243
- "guidanceForNext": "What to focus on next",
244
- "detectingLoop": false, // Set true if repeating same approach
245
- "loopReasoning": "Why I think I'm looping" // If detecting loop
246
- },
247
- "experiences": ["App-specific learning"], // APP-SPECIFIC patterns (see rules below)
248
- "status": "continue|complete|stuck|infeasible", // Your decision
249
- "statusReasoning": "Why this status",
250
- "reasoning": "Overall reasoning for this iteration",
251
- "blockerDetected": { // OPTIONAL: If something blocks the current goal
252
- "description": "What's blocking", // E.g., "Cookie consent modal"
253
- "clearingCommands": ["await page.click(...)"] // Commands to clear blocker FIRST
254
- },
255
- "stepReEvaluation": { // OPTIONAL: If questioning step order (last resort)
256
- "detected": true, // Set true if you think step order is wrong
257
- "issue": "prior_incomplete|already_done|wrong_order", // What's the issue?
258
- "explanation": "Why step order seems wrong" // Your reasoning
259
- }
260
- }
261
-
262
- STATUS DECISION RULES (CRITICAL!):
263
-
264
- - "complete": ALL commands succeeded AND step goal achieved
265
- ❌ NEVER mark complete if ANY command failed!
266
-
267
- - "continue": Command failed OR need more actions
268
- Use when: Need to retry with different selector, wait for something, or continue execution
269
-
270
- - "stuck": Tried 3+ iterations, all failed
271
-
272
- - "infeasible": Goal truly impossible (element doesn't exist anywhere)
273
-
274
- IF COMMAND FAILED → status must be "continue" (to retry) or "stuck" (if tried many times)
275
- IF COMMAND SUCCEEDED → status can be "complete" (if goal done) or "continue" (if more needed)
276
-
277
- STEP RE-EVALUATION (LAST RESORT - After 2+ Failures):
278
-
279
- When you've failed multiple times on current step, QUESTION YOUR ASSUMPTIONS:
280
-
281
- 1. **LOOK BACK**: Did prior steps really complete?
282
- - Check COMPLETED steps list vs current page state
283
- - Example: Says "Login complete" but you see login form → PRIOR STEP FAILED
284
- - If prior step incomplete: Execute commands to complete it, explain in reasoning
285
-
286
- 2. **LOOK FORWARD**: Is current step already done or no longer relevant?
287
- - Check if current step goal already achieved (element exists, state correct)
288
- - Example: Step says "Click Submit" but form already submitted → STEP ALREADY DONE
289
- - If already done: Mark "complete" and explain why
290
-
291
- 3. **LOOK AHEAD**: Is next step more feasible?
292
- - Sometimes UI flows differently than expected
293
- - Example: Current step "Click Edit" fails, but next step "Fill Name" elements are visible
294
- - If can do future step: Note in reasoning, but PREFER sticking to order
295
-
296
- STICK TO ORIGINAL PLAN unless clear evidence suggests otherwise.
297
- Only re-evaluate after repeated failures on same step.
298
-
299
- BLOCKER DETECTION (IMPORTANT!):
300
-
301
- Use "blockerDetected" when the CURRENT STEP GOAL is not immediately feasible because something unexpected is in the way:
302
-
303
- Examples of BLOCKERS:
304
- - Cookie consent modal appears before login
305
- - "What's New" tour overlay prevents clicking navigation
306
- - Email verification prompt blocks form submission
307
- - Session timeout modal requires re-login before continuing
308
-
309
- When you detect a blocker:
310
- 1. Set "blockerDetected.description" to describe what's blocking
311
- 2. Provide "clearingCommands" to dismiss/handle the blocker
312
- 3. Set your main "commands" to execute AFTER blocker is cleared
313
- 4. Agent will execute: clearingCommands FIRST, then your commands
314
-
315
- Example:
316
- {
317
- "blockerDetected": {
318
- "description": "Cookie consent modal blocking page interaction",
319
- "clearingCommands": ["await page.getByRole('button', {name: 'Accept'}).click()"]
320
- },
321
- "commands": ["await page.getByRole('textbox', {name: 'Email'}).fill('user@example.com')"],
322
- ...
323
- }
324
-
325
- DON'T use blockerDetected for:
326
- - Normal step execution (just use commands)
327
- - Wrong selectors (that's not a blocker, just fix the selector)
328
- - Missing elements (that's infeasible, not blocked)
329
-
330
- EXPERIENCES - APP-SPECIFIC PATTERNS ONLY:
331
-
332
- Guidelines:
333
- - Concise - combine related info into single learning
334
- - Focus on app quirks, not obvious practices
335
- - Max 1-2 experiences per iteration
336
- - Capture SEMANTIC SELECTOR patterns when discovered
337
-
338
- ✅ GOOD - App-specific patterns:
339
- - "Login form fields accessible via getByLabel: 'Email' and 'Password'"
340
- - "Submit buttons consistently use role=button with text matching action"
341
- - "Form uses custom dropdowns (role=combobox) instead of native selects"
342
- - "Navigation items accessible via getByRole('link') with visible text"
343
- - "Site uses data-testid attributes consistently (pattern: action-element-name)"
344
- - "Input fields have clear placeholders - prefer getByPlaceholder over IDs"
345
-
346
- ❌ BAD - Too specific or obvious:
347
- - Listing individual field selectors (combine them)
348
- - "Button exists with role=button" (obvious)
349
- - Generic automation advice
350
- - Noting auto-generated IDs like #«r3»-form-item (these are unreliable)
351
-
352
- COMBINE related learnings instead of listing each element separately.
353
- PRIORITIZE learning about semantic selector availability over CSS selectors.
354
-
355
- CRITICAL RULES FOR DECISION MAKING:
356
-
357
- 1. **BE FOCUSED AND CONCISE** (CRITICAL!):
358
- - Read the CURRENT STEP GOAL carefully - ONLY do what's explicitly asked for in THAT step
359
- - DON'T add extra actions, verifications, or defensive checks not mentioned in the step goal
360
- - If step says "Send a message" → ONLY send the message, DON'T verify it appeared
361
- - If step says "Click Submit" → ONLY click, DON'T wait or verify submission success
362
- - ONLY add expect() assertions if the step explicitly mentions "verify", "check", "ensure", or "confirm"
363
- - Keep commands minimal - no unnecessary preparatory or validation actions
364
-
365
- 2. **COMMAND EFFICIENCY**:
366
- - DON'T click on input field before filling it - just use .fill() directly
367
- - DON'T do preparatory actions unless necessary
368
- - Example: page.fill works without clicking the input first
369
-
370
- 2b. **VARIABLE SCOPE & AVAILABLE FUNCTIONS** (CRITICAL!):
371
- - Commands in SAME BATCH share scope - variables persist within iteration
372
- - To save data for later steps: Use extract_data tool, then access via extractedData in context
373
- - Within same batch: Variables work across commands
374
- - AVAILABLE IN CONTEXT: page (Playwright Page object) and expect (Playwright assertions)
375
- - You can use expect() directly - it is already imported and available
376
-
377
- 3. **WHEN COMMAND FAILS** (most important!):
378
- - DON'T repeat the same command/selector
379
- - DO analyze the error message
380
- - DO look at DOM snapshot for SEMANTIC selectors (getByRole, getByLabel, getByText)
381
- - AVOID falling back to auto-generated IDs with unicode characters (e.g., #«r3»-form-item)
382
- - DO take screenshot if DOM doesn't help
383
- - "Element is not a <select>" → It's a custom dropdown, use .click() then select from menu
384
- - "Timeout waiting for locator" → Try semantic selectors from ARIA tree instead of CSS IDs
385
-
386
- 4. **CHECK YOUR RECENT STEPS FOR PATTERNS**:
387
- - If you see same selector failed 2+ times → YOU ARE LOOPING!
388
- - Set detectingLoop: true and explain why
389
- - Take screenshot to see what's actually on the page
390
- - Use different SEMANTIC selector from DOM/ARIA tree (getByRole, getByLabel, getByText, data-testid)
391
- - AVOID repeating with similar auto-generated ID patterns (e.g., #«r3»-form-item → #«r4»-form-item)
392
-
393
- 5. **USE DOM SNAPSHOT** - ONLY USE WHAT'S ACTUALLY THERE (CRITICAL!):
394
- - You get INTERACTIVE ELEMENTS list showing: position, tag, id, text, SUGGESTED SELECTORS
395
- - ONLY use selectors from this list - DO NOT invent selectors with text/names not in the list!
396
- - ❌ WRONG: List shows button "Submit" but you use getByRole('button', {name: 'SomeOtherText'})
397
- - ✅ RIGHT: Use EXACT selector from the provided list
398
- - ✅ RIGHT: If list shows "#submit-btn" → use that
399
- - ✅ RIGHT: If list shows "getByRole('button', {name: 'X'})" → use exactly that
400
- - If element you need is NOT in list → take_screenshot to find it
401
- - ARIA tree shows roles/names that exist - inventing roles/names not in tree will fail!
402
-
403
- 5b. **SELECTOR PREFERENCE** (CRITICAL - Playwright Best Practices):
404
- ALWAYS prefer user-friendly, semantic selectors in this order:
405
-
406
- 🏆 BEST (use these first):
407
- 1. page.getByRole('role', {name: 'text'}) → ✅ Accessible, semantic, resilient
408
- 2. page.getByLabel('label text') → ✅ Great for form inputs
409
- 3. page.getByPlaceholder('placeholder') → ✅ Good for inputs without labels
410
- 4. page.getByText('visible text') → ✅ Clear and readable
411
- 5. page.getByTestId('test-id') → ✅ Stable if available
412
-
413
- ⚠️ AVOID (last resort only):
414
- - CSS selectors with auto-generated IDs: #r3-form-item, #«r3»-form-item
415
- - CSS selectors with unicode: #«r4»-form-item
416
- - Complex CSS paths: div.ant-form > div:nth-child(2)
417
-
418
- Examples:
419
- ❌ BAD: await page.fill('#«r3»-form-item', 'alice@example.com')
420
- ✅ GOOD: await page.getByLabel('Email').fill('alice@example.com')
421
- ✅ GOOD: await page.getByRole('textbox', {name: 'Email'}).fill('alice@example.com')
422
- ✅ GOOD: await page.getByPlaceholder('Enter your email').fill('alice@example.com')
423
-
424
- ❌ BAD: await page.click('#submit-btn-23847')
425
- ✅ GOOD: await page.getByRole('button', {name: 'Submit'}).click()
426
- ✅ GOOD: await page.getByText('Submit').click()
427
-
428
- WHY: Semantic selectors are more maintainable, self-documenting, and resilient to UI changes.
429
- Auto-generated IDs (especially with unicode like «r3») break when component instances change.
430
-
431
- 5c. **ASSERTIONS AND VERIFICATION** (CRITICAL - Use expect):
432
- Use expect() for verification ONLY when the step explicitly asks for verification.
433
-
434
- 🏆 WHEN TO USE expect():
435
- - Step says "Verify message appears" → await expect(locator).toBeVisible()
436
- - Step says "Check field is empty" → await expect(locator).toBeEmpty()
437
- - Step says "Confirm URL changed" → await expect(page).toHaveURL('...')
438
- - Step explicitly mentions: "verify", "check", "ensure", "confirm", "validate"
439
-
440
- ❌ WHEN NOT TO USE expect():
441
- - Step says "Send a message" → DON'T add expect() for message visibility
442
- - Step says "Click Submit" → DON'T add expect() for button state or result
443
- - Step says "Fill Email field" → DON'T add expect() to verify it's filled
444
- - Step says "Navigate to Dashboard" → DON'T add expect() for URL (unless step asks)
445
-
446
- 🏆 expect() syntax (when needed):
447
- - await expect(locator).toBeVisible()
448
- - await expect(locator).toHaveText('expected text')
449
- - await expect(locator).toHaveValue('expected value')
450
- - await expect(locator).toBeEmpty()
451
- - await expect(page).toHaveURL('expected-url')
452
-
453
- ❌ BAD - Manual checks:
454
- const value = await page.getByPlaceholder('Message...').inputValue();
455
- if (value !== '') throw new Error(\`Expected empty but found: '\${value}'\`);
456
-
457
- ✅ GOOD - Use expect (but ONLY if step asks to verify):
458
- await expect(page.getByPlaceholder('Message...')).toBeEmpty();
459
-
460
- WHY: expect() has built-in retries and better error messages.
461
- BUT: Only use it when verification is explicitly requested in the step goal.
462
-
463
- 6. **TOOLS ARE READ-ONLY (Information Gathering)**:
464
- AVAILABLE TOOLS (for information only):
465
- - take_screenshot: Get visual context when DOM isn't enough or stuck 2+ times
466
- * DEFAULT: Captures FULL PAGE (elements below fold included)
467
- * Set isFullPage=false ONLY if you need just viewport
468
- * Use for: Planning, finding hidden elements, verifying layout
469
- - recall_history: Remember earlier steps in journey
470
- - inspect_page: Force fresh DOM snapshot (rarely needed, already in context)
471
- - check_page_ready: Verify page loaded (use sparingly!)
472
- - extract_data: Save data from page to memory for later steps
473
-
474
- ALL STATE CHANGES USE PLAYWRIGHT COMMANDS:
475
- - Navigation: await page.goto('url')
476
- - Go back: await page.goBack()
477
- - Refresh: await page.reload()
478
- - Fill inputs: await page.fill() or page.getByRole().fill()
479
- - Click buttons: await page.click() or page.getByRole().click()
480
- - Wait: await page.waitForURL(), page.waitForLoadState()
481
-
482
- Tools gather info → Commands change state
483
-
484
- 7. **ERROR-SPECIFIC RESPONSES**:
485
- - "Timeout waiting for locator" → Selector doesn't exist, try different one from DOM
486
- - "Element is not a <select>" → It's a custom dropdown (button with role=combobox), use .click()
487
- - "Multiple elements" → Add more specific selector from DOM (use #id)
488
-
489
- 8. **WHEN TO RUN COMMANDS vs TOOLS**:
490
- - Confident about selectors from DOM → Run commands directly
491
- - Unsure or failed 2+ times → Take screenshot first
492
- - First iteration of a step → Usually can run commands from DOM`;
493
- }
494
- /**
495
- * Build user prompt with context
496
- */
497
- buildUserPrompt(context, consecutiveFailures) {
498
- const parts = [];
499
- parts.push('=== YOUR CURRENT CONTEXT ===\n');
500
- // System warnings for accumulated failures
501
- if (consecutiveFailures && consecutiveFailures >= 2) {
502
- parts.push(`⚠️⚠️⚠️ SYSTEM WARNING ⚠️⚠️⚠️`);
503
- parts.push(`You've failed ${consecutiveFailures} iterations in a row!`);
504
- parts.push(`Next failure will force STUCK. You MUST:`);
505
- parts.push(`1. Take screenshot if you haven't already`);
506
- parts.push(`2. Try completely different selector strategy`);
507
- parts.push(`3. QUESTION ASSUMPTIONS: Am I at the right step?`);
508
- parts.push(` - LOOK BACK: Did prior steps really complete? Check page state vs COMPLETED list`);
509
- parts.push(` - LOOK FORWARD: Is current step already done? Check if goal already achieved`);
510
- parts.push(`4. OR mark status="stuck" if truly can't proceed`);
511
- parts.push(`⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️\n`);
512
- }
513
- // Goals - make current step very prominent
514
- parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
515
- parts.push(`${context.currentStepGoal}`);
516
- parts.push(`^^ DO ONLY WHAT THIS STEP ASKS - NO EXTRA ACTIONS OR VERIFICATIONS ^^`);
517
- parts.push(`^^ If step doesn't say "verify/check/confirm" → DON'T add expect() assertions ^^`);
518
- parts.push(`^^ Mark "complete" ONLY if commands succeeded ^^`);
519
- parts.push(``);
520
- parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);
521
- if (context.completedSteps.length > 0) {
522
- parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
523
- }
524
- if (context.remainingSteps.length > 0) {
525
- parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
526
- }
527
- // Current page state
528
- parts.push(`\nCURRENT PAGE:`);
529
- parts.push(`URL: ${context.currentURL}`);
530
- parts.push(`Title: ${context.currentPageInfo.title}`);
531
- parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
532
- parts.push(context.currentPageInfo.formattedElements);
533
- parts.push(`\nARIA TREE (hierarchical structure):`);
534
- parts.push(JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2).substring(0, 5000));
535
- if (JSON.stringify(context.currentPageInfo.ariaSnapshot).length > 5000) {
536
- parts.push('... (truncated)');
537
- }
538
- parts.push('');
539
- // Recent steps
540
- if (context.recentSteps.length > 0) {
541
- parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
542
- for (const step of context.recentSteps) {
543
- const status = step.result === 'success' ? '✓' : '✗';
544
- parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
545
- parts.push(` Code: ${step.code}`);
546
- if (step.result === 'failure' && step.error) {
547
- parts.push(` ❌ ERROR: ${step.error}`);
548
- parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
549
- }
550
- else {
551
- parts.push(` Result: ${step.observation}`);
552
- }
553
- }
554
- parts.push('');
555
- // Detect repeated failures
556
- const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
557
- if (recentFailures.length >= 2) {
558
- const sameSelector = recentFailures.slice(-2).every((s, i, arr) => i === 0 || s.code === arr[i - 1].code);
559
- if (sameSelector) {
560
- parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
561
- parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
562
- parts.push(` YOU MUST try a completely different selector this time!\n`);
563
- }
564
- }
565
- }
566
- // Experiences
567
- if (context.experiences.length > 0) {
568
- parts.push(`\nLEARNINGS (${context.experiences.length}):`);
569
- for (const exp of context.experiences) {
570
- parts.push(` - ${exp}`);
571
- }
572
- parts.push('');
573
- }
574
- // Extracted data
575
- if (Object.keys(context.extractedData).length > 0) {
576
- parts.push(`\nEXTRACTED DATA:`);
577
- for (const [key, value] of Object.entries(context.extractedData)) {
578
- parts.push(` ${key}: ${value}`);
579
- }
580
- parts.push('');
581
- }
582
- // Previous reflection
583
- if (context.previousIterationGuidance) {
584
- parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
585
- parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
586
- if (context.previousIterationGuidance.detectingLoop) {
587
- parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
588
- }
589
- parts.push('');
590
- }
591
- // Tool results (if any)
592
- if (context.toolResults) {
593
- parts.push(`\n========== TOOL RESULTS ==========`);
594
- for (const [toolName, result] of Object.entries(context.toolResults)) {
595
- parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
596
- if (result.learning) {
597
- parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
598
- parts.push(result.learning);
599
- parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
600
- }
601
- if (result.error) {
602
- parts.push(` Error: ${result.error}`);
603
- }
604
- }
605
- parts.push(`\n===================================\n`);
606
- }
607
- parts.push('=== WHAT DO YOU WANT TO DO? ===');
608
- parts.push('Respond with JSON following the format specified in the system prompt.');
609
- return parts.join('\n');
610
- }
611
- /**
612
- * Parse agent decision from LLM response
613
- */
614
- parseAgentDecision(response) {
615
- try {
616
- // Extract JSON from response
617
- const jsonMatch = response.match(/\{[\s\S]*\}/);
618
- if (!jsonMatch) {
619
- throw new Error('No JSON found in response');
620
- }
621
- const parsed = JSON.parse(jsonMatch[0]);
622
- // Validate required fields
623
- if (!parsed.status || !parsed.reasoning) {
624
- throw new Error('Missing required fields: status and reasoning');
625
- }
626
- return parsed;
627
- }
628
- catch (error) {
629
- this.logger?.(`[Orchestrator] ✗ Failed to parse agent decision: ${error.message}`, 'error');
630
- this.logger?.(`[Orchestrator] Raw response: ${response.substring(0, 200)}...`, 'log');
631
- // Return fallback
632
- return {
633
- status: 'stuck',
634
- statusReasoning: 'Failed to parse agent response',
635
- reasoning: `Parse error: ${error.message}`
636
- };
637
- }
638
- }
639
480
  /**
640
481
  * Execute tools
641
482
  */
642
- async executeTools(toolCalls, page, memory, stepNumber) {
483
+ async executeTools(toolCalls, page, memory, stepNumber, refMap) {
643
484
  this.logger?.(`[Orchestrator] 🔧 Executing ${toolCalls.length} tool(s)`);
644
485
  const results = {};
645
486
  const toolContext = {
646
487
  page,
647
488
  memory,
648
489
  stepNumber,
649
- logger: this.logger
490
+ logger: this.logger,
491
+ refMap, // Pass refMap for interact_with_ref tool
492
+ previousSomScreenshot: this.previousSomScreenshot, // For view_previous_screenshot tool
493
+ somHandler: this.somHandler // For refresh_som_markers tool
650
494
  };
651
495
  for (const toolCall of toolCalls.slice(0, this.config.maxToolCallsPerIteration)) {
652
496
  this.logger?.(`[Orchestrator] ▶ ${toolCall.name}(${JSON.stringify(toolCall.params).substring(0, 50)}...)`);
@@ -662,45 +506,214 @@ CRITICAL RULES FOR DECISION MAKING:
662
506
  return results;
663
507
  }
664
508
  /**
665
- * Execute commands sequentially with SHARED context (variables persist across commands)
509
+ * Parse SomCommand from command object
510
+ */
511
+ parseSomCommand(cmd) {
512
+ if (typeof cmd === 'object' && cmd.action) {
513
+ // Valid if: has elementRef, OR has coord, OR is navigation action
514
+ const isNavigationAction = ['navigate', 'goBack', 'goForward', 'reload'].includes(cmd.action);
515
+ const hasValidTarget = cmd.elementRef || cmd.coord || isNavigationAction;
516
+ if (hasValidTarget) {
517
+ return {
518
+ elementRef: cmd.elementRef,
519
+ coord: cmd.coord,
520
+ action: cmd.action,
521
+ value: cmd.value,
522
+ fromCoord: cmd.fromCoord,
523
+ toCoord: cmd.toCoord,
524
+ force: cmd.force,
525
+ scrollAmount: cmd.scrollAmount,
526
+ scrollDirection: cmd.scrollDirection,
527
+ button: cmd.button,
528
+ clickCount: cmd.clickCount,
529
+ modifiers: cmd.modifiers,
530
+ delay: cmd.delay,
531
+ timeout: cmd.timeout
532
+ };
533
+ }
534
+ }
535
+ return null;
536
+ }
537
+ /**
538
+ * Execute commands (mix of ref and playwright commands)
666
539
  */
667
- async executeCommandsSequentially(commands, page, memory, stepNumber, iteration, jobId) {
668
- this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s) in shared context`);
540
+ async executeCommands(commands, page, memory, stepNumber, iteration, jobId) {
541
+ this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s)`);
669
542
  const executed = [];
670
- const limitedCommands = commands.slice(0, this.config.maxCommandsPerIteration);
671
- // Build execution with shared context (all commands share scope - variables persist)
672
- const commandsWithTracking = limitedCommands.map((cmd, i) => {
673
- return `
674
- // Command ${i + 1}/${limitedCommands.length}
543
+ if (commands.length === 0) {
544
+ return { executed: [], allSucceeded: true };
545
+ }
546
+ // SoM mode: Execute commands through PageSoMHandler
547
+ if (this.config.useSoM && this.somHandler) {
548
+ this.logger?.(`[Orchestrator] Using SoM mode for command execution`, 'log');
549
+ for (let i = 0; i < commands.length; i++) {
550
+ const cmd = commands[i];
551
+ // Check if verification or action command
552
+ if ((0, som_types_1.isSomVerification)(cmd)) {
553
+ // Handle verification command
554
+ try {
555
+ const result = await this.somHandler.executeVerification(cmd);
556
+ // Always add command to executed array (even if verification failed)
557
+ // Scripts should contain the expect even if it fails during generation
558
+ if (result.playwrightCommand) {
559
+ executed.push(result.playwrightCommand);
560
+ }
561
+ if (result.success) {
562
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] Verification passed`, 'log');
563
+ memory.history.push({
564
+ stepNumber,
565
+ iteration,
566
+ action: `Verification ${i + 1}/${commands.length}: ${cmd.verificationType}`,
567
+ code: result.playwrightCommand,
568
+ result: 'success',
569
+ observation: `Verified: ${cmd.description || cmd.expected}`,
570
+ url: page.url(),
571
+ timestamp: Date.now()
572
+ });
573
+ }
574
+ else {
575
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] Verification failed (non-fatal): ${result.error}`, 'warn');
576
+ memory.history.push({
577
+ stepNumber,
578
+ iteration,
579
+ action: `Verification ${i + 1}/${commands.length} - FAILED`,
580
+ code: result.playwrightCommand || JSON.stringify(cmd),
581
+ result: 'failure',
582
+ observation: `Failed: ${result.error}`,
583
+ error: result.error,
584
+ url: page.url(),
585
+ timestamp: Date.now()
586
+ });
587
+ // Continue anyway - verification failures are non-blocking for script generation
588
+ }
589
+ // Small delay between commands
590
+ if (i < commands.length - 1) {
591
+ await page.waitForTimeout(300);
592
+ }
593
+ }
594
+ catch (error) {
595
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] Verification exception: ${error.message}`, 'error');
596
+ }
597
+ }
598
+ else if ((0, som_types_1.isSomCommand)(cmd)) {
599
+ // Handle action command (existing logic)
600
+ const somCommand = cmd;
601
+ try {
602
+ const result = await this.somHandler.runCommand(somCommand, this.config.somUseSomIdBasedCommands || false);
603
+ if (result.status === som_types_1.CommandRunStatus.SUCCESS && result.successAttempt) {
604
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] SoM action succeeded`, 'log');
605
+ executed.push(result.successAttempt.command);
606
+ memory.history.push({
607
+ stepNumber,
608
+ iteration,
609
+ action: `SoM Action ${i + 1}/${commands.length}: ${somCommand.action}`,
610
+ code: result.successAttempt.command,
611
+ result: 'success',
612
+ observation: 'Executed successfully',
613
+ url: page.url(),
614
+ timestamp: Date.now()
615
+ });
616
+ // Small delay for form validation/animations
617
+ if (i < commands.length - 1) {
618
+ await page.waitForTimeout(300);
619
+ }
620
+ }
621
+ else {
622
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] SoM action failed: ${result.error}`, 'error');
623
+ memory.history.push({
624
+ stepNumber,
625
+ iteration,
626
+ action: `SoM Action ${i + 1}/${commands.length}: ${somCommand.action} - FAILED`,
627
+ code: JSON.stringify(somCommand),
628
+ result: 'failure',
629
+ observation: `Failed: ${result.error}`,
630
+ error: result.error,
631
+ url: page.url(),
632
+ timestamp: Date.now()
633
+ });
634
+ // Refresh SoM after batch (DOM may have changed)
635
+ if (this.somHandler && page) {
636
+ this.somHandler.setPage(page);
637
+ await this.somHandler.updateSom();
638
+ }
639
+ return { executed, allSucceeded: false };
640
+ }
641
+ }
642
+ catch (error) {
643
+ this.logger?.(`[Orchestrator] ✗ [${i + 1}/${commands.length}] SoM action exception: ${error.message}`, 'error');
644
+ memory.history.push({
645
+ stepNumber,
646
+ iteration,
647
+ action: `SoM Action ${i + 1}/${commands.length} - EXCEPTION`,
648
+ code: JSON.stringify(somCommand),
649
+ result: 'failure',
650
+ observation: `Exception: ${error.message}`,
651
+ error: error.message,
652
+ url: page.url(),
653
+ timestamp: Date.now()
654
+ });
655
+ // Refresh SoM after batch (DOM may have changed)
656
+ if (this.somHandler && page) {
657
+ this.somHandler.setPage(page);
658
+ await this.somHandler.updateSom();
659
+ }
660
+ return { executed, allSucceeded: false };
661
+ }
662
+ }
663
+ else {
664
+ this.logger?.(`[Orchestrator] ⚠ [${i + 1}/${commands.length}] Not a valid SoM command/verification, skipping`, 'warn');
665
+ }
666
+ }
667
+ // Always wait for page to stabilize after command batch
668
+ // This handles both explicit navigation AND clicks that trigger navigation/SPA routes
669
+ try {
670
+ this.logger?.(`[Orchestrator] Waiting for page to stabilize...`, 'log');
671
+ // Use networkidle with short timeout to catch navigation without blocking on SPAs with continuous requests
672
+ await page.waitForLoadState('networkidle', { timeout: 3000 });
673
+ this.logger?.(`[Orchestrator] Page stabilized (networkidle)`, 'log');
674
+ }
675
+ catch (error) {
676
+ // If networkidle times out, fall back to domcontentloaded
677
+ try {
678
+ await page.waitForLoadState('domcontentloaded', { timeout: 2000 });
679
+ this.logger?.(`[Orchestrator] Page loaded (domcontentloaded)`, 'log');
680
+ }
681
+ catch (error2) {
682
+ this.logger?.(`[Orchestrator] Page load wait timeout (continuing anyway)`, 'warn');
683
+ }
684
+ }
685
+ // Refresh SoM after batch (DOM may have changed and page is now stable)
686
+ if (this.somHandler && page) {
687
+ this.somHandler.setPage(page);
688
+ await this.somHandler.updateSom();
689
+ }
690
+ return { executed, allSucceeded: true };
691
+ }
692
+ // Standard mode: Execute all commands in sequence with small delay between them
693
+ // Delay helps with form validation, button enabling, and animations
694
+ const wrappedCode = commands.map((cmd, i) => `
695
+ // Command ${i + 1}/${commands.length}
675
696
  try {
676
697
  ${cmd}
677
698
  __results.push({ index: ${i}, success: true });
699
+ ${i < commands.length - 1 ? 'await page.waitForTimeout(300);' : ''} // Small delay for form validation/animations
678
700
  } catch (error) {
679
701
  __results.push({ index: ${i}, success: false, error: error.message });
680
- throw error; // Stop on first failure
681
- }`;
682
- }).join('\n');
683
- const wrappedCode = `
684
- const __results = [];
685
- ${commandsWithTracking}
686
- return __results;
687
- `;
702
+ throw error;
703
+ }`).join('\n');
704
+ const fullCode = `const __results = []; ${wrappedCode} return __results;`;
688
705
  try {
689
- // Execute in shared context - variables declared here persist for entire scenario
690
- // Pass both page and expect to make Playwright assertions available
691
- const func = new Function('page', 'expect', 'return (async () => { ' + wrappedCode + ' })()');
692
- const results = await func(page, test_1.expect);
693
- // Record results for each command
694
- for (let i = 0; i < limitedCommands.length; i++) {
695
- const cmd = limitedCommands[i];
706
+ const func = new Function('page', 'expect', 'return (async () => { ' + fullCode + ' })()');
707
+ const results = await func(page, global.expect);
708
+ for (let i = 0; i < commands.length; i++) {
709
+ const cmd = commands[i];
696
710
  const result = results[i];
697
711
  if (result && result.success) {
698
- this.logger?.(`[Orchestrator] ✓ [${i + 1}/${limitedCommands.length}] Success`);
699
- // Record in history
712
+ this.logger?.(`[Orchestrator] ✓ [${i + 1}/${commands.length}] Success`);
700
713
  memory.history.push({
701
714
  stepNumber,
702
715
  iteration,
703
- action: `Command ${i + 1}/${limitedCommands.length}`,
716
+ action: `Command ${i + 1}/${commands.length}`,
704
717
  code: cmd,
705
718
  result: 'success',
706
719
  observation: 'Executed successfully',
@@ -710,122 +723,27 @@ return __results;
710
723
  executed.push(cmd);
711
724
  }
712
725
  }
713
- // Cap history
714
726
  if (memory.history.length > this.config.maxHistorySize) {
715
727
  memory.history = memory.history.slice(-this.config.maxHistorySize);
716
728
  }
717
729
  return { executed, allSucceeded: true };
718
730
  }
719
731
  catch (error) {
720
- // One of the commands failed - find which one
721
732
  const errorMessage = error.message || String(error);
722
- // Capture page state for debug logging
723
- let pageStateDebug = '';
724
- if (this.debugMode) {
725
- try {
726
- const pageInfo = await (0, page_info_utils_1.getEnhancedPageInfo)(page);
727
- pageStateDebug = `
728
- === DEBUG: PAGE STATE AT FAILURE ===
729
- URL: ${page.url()}
730
- Title: ${pageInfo.title}
731
-
732
- INTERACTIVE ELEMENTS:
733
- ${pageInfo.formattedElements}
734
-
735
- ARIA SNAPSHOT:
736
- ${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}
737
-
738
- ====================================`;
739
- }
740
- catch (debugError) {
741
- pageStateDebug = `Failed to capture page state: ${debugError.message}`;
742
- }
743
- }
744
- // Record all that succeeded, then the failure
745
- for (let i = 0; i < limitedCommands.length; i++) {
746
- const cmd = limitedCommands[i];
747
- // This is a failed command (error happened here or earlier)
748
- if (executed.length <= i) {
749
- this.logger?.(`[Orchestrator] ✗ [${i + 1}/${limitedCommands.length}] Failed: ${errorMessage}`, 'error');
750
- // Log detailed debug info
751
- if (this.debugMode && pageStateDebug) {
752
- this.logger?.(pageStateDebug, 'debug');
753
- }
754
- memory.history.push({
755
- stepNumber,
756
- iteration,
757
- action: `Command ${i + 1}/${limitedCommands.length} - FAILED`,
758
- code: cmd,
759
- result: 'failure',
760
- observation: `Failed with error: ${errorMessage}. This selector likely doesn't exist or is incorrect.`,
761
- error: errorMessage,
762
- url: page.url(),
763
- timestamp: Date.now()
764
- });
765
- if (i < limitedCommands.length - 1) {
766
- this.logger?.(`[Orchestrator] ⚠ Skipping remaining ${limitedCommands.length - i - 1} command(s)`, 'warn');
767
- }
768
- break;
769
- }
770
- }
771
- return { executed, allSucceeded: false };
772
- }
773
- }
774
- /**
775
- * Execute a single command
776
- */
777
- async executeCommand(cmd, page) {
778
- // Wrap in async function and execute
779
- const wrapped = `(async () => { ${cmd} })()`;
780
- try {
781
- await eval(wrapped);
782
- }
783
- catch (error) {
784
- // If eval fails, try direct execution with page context
785
- // Pass both page and expect to make Playwright assertions available
786
- const func = new Function('page', 'expect', `return (async () => { ${cmd} })()`);
787
- await func(page, test_1.expect);
788
- }
789
- }
790
- /**
791
- * Log agent decision
792
- */
793
- logAgentDecision(decision, iteration) {
794
- this.logger?.(`[Orchestrator] 💭 REASONING: ${decision.reasoning}`);
795
- if (decision.selfReflection) {
796
- this.logger?.(`[Orchestrator] 🧠 SELF-REFLECTION:`);
797
- this.logger?.(`[Orchestrator] Next: ${decision.selfReflection.guidanceForNext}`);
798
- if (decision.selfReflection.detectingLoop) {
799
- this.logger?.(`[Orchestrator] 🔄 LOOP DETECTED: ${decision.selfReflection.loopReasoning}`, 'warn');
800
- }
801
- }
802
- if (decision.toolCalls && decision.toolCalls.length > 0) {
803
- this.logger?.(`[Orchestrator] 🔧 TOOLS: ${decision.toolCalls.map(t => t.name).join(', ')}`);
804
- if (decision.toolReasoning) {
805
- this.logger?.(`[Orchestrator] 📋 Why: ${decision.toolReasoning}`);
806
- }
807
- }
808
- if (decision.blockerDetected) {
809
- this.logger?.(`[Orchestrator] 🚧 BLOCKER: ${decision.blockerDetected.description}`, 'warn');
810
- this.logger?.(`[Orchestrator] 🧹 Clearing with ${decision.blockerDetected.clearingCommands.length} command(s)`);
811
- }
812
- if (decision.stepReEvaluation?.detected) {
813
- this.logger?.(`[Orchestrator] 🔍 STEP RE-EVALUATION: ${decision.stepReEvaluation.issue}`, 'warn');
814
- this.logger?.(`[Orchestrator] 📝 Explanation: ${decision.stepReEvaluation.explanation}`);
815
- }
816
- if (decision.commands && decision.commands.length > 0) {
817
- this.logger?.(`[Orchestrator] 📝 COMMANDS (${decision.commands.length}):`);
818
- decision.commands.slice(0, 3).forEach((cmd, i) => {
819
- this.logger?.(`[Orchestrator] ${i + 1}. ${cmd.substring(0, 80)}...`);
733
+ this.logger?.(`[Orchestrator] Command execution failed: ${errorMessage}`, 'error');
734
+ memory.history.push({
735
+ stepNumber,
736
+ iteration,
737
+ action: `Command - FAILED`,
738
+ code: commands[executed.length] || '',
739
+ result: 'failure',
740
+ observation: `Failed: ${errorMessage}`,
741
+ error: errorMessage,
742
+ url: page.url(),
743
+ timestamp: Date.now()
820
744
  });
821
- if (decision.commands.length > 3) {
822
- this.logger?.(`[Orchestrator] ... and ${decision.commands.length - 3} more`);
823
- }
824
- if (decision.commandReasoning) {
825
- this.logger?.(`[Orchestrator] 💡 Why: ${decision.commandReasoning}`);
826
- }
745
+ return { executed, allSucceeded: false };
827
746
  }
828
- // Experiences will be logged when added to memory, no need to log here
829
747
  }
830
748
  /**
831
749
  * Report step progress
@@ -850,6 +768,219 @@ ${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}
850
768
  agentStatus: decision.status
851
769
  });
852
770
  }
771
+ /**
772
+ * Execute exploration mode - agent autonomously explores to achieve journey goal
773
+ * Fires onStepProgress callbacks for each autonomous action (transparent to caller)
774
+ */
775
+ async executeExploration(page, explorationConfig, jobId) {
776
+ this.logger?.(`\n[Orchestrator] ========== EXPLORATION MODE ==========`);
777
+ this.logger?.(`[Orchestrator] 🎯 Journey Goal: ${explorationConfig.explorationPrompt}`);
778
+ if (explorationConfig.testDataPrompt) {
779
+ this.logger?.(`[Orchestrator] 📋 Test Data: ${explorationConfig.testDataPrompt}`);
780
+ }
781
+ const memory = {
782
+ history: [],
783
+ experiences: [],
784
+ extractedData: {}
785
+ };
786
+ const maxSteps = explorationConfig.maxExplorationSteps || 50;
787
+ let stepNumber = 0;
788
+ const commandsExecuted = [];
789
+ while (stepNumber < maxSteps) {
790
+ stepNumber++;
791
+ this.logger?.(`\n[Orchestrator] === Exploration Step ${stepNumber}/${maxSteps} ===`);
792
+ // Build exploratory context
793
+ const context = await this.buildExploratoryContext(page, explorationConfig.explorationPrompt, explorationConfig.testDataPrompt, memory, stepNumber, maxSteps);
794
+ // Call agent with exploratory prompt
795
+ const decision = await this.callExploratoryAgent(context, jobId, stepNumber);
796
+ this.decisionParser.log(decision, stepNumber);
797
+ // Report step start (fires JourneyRunner's beforeStepStart callback)
798
+ if (this.progressReporter?.onStepProgress) {
799
+ const stepInfo = {
800
+ jobId,
801
+ stepNumber,
802
+ stepId: `exploration-${stepNumber}-${Date.now()}`,
803
+ description: decision.reasoning,
804
+ code: '', // Will be filled after commands execute
805
+ status: progress_reporter_1.StepExecutionStatus.IN_PROGRESS,
806
+ wasRepaired: false
807
+ };
808
+ await this.progressReporter.onStepProgress(stepInfo);
809
+ }
810
+ // Execute tools if requested
811
+ if (decision.toolCalls && decision.toolCalls.length > 0) {
812
+ const toolResults = await this.executeTools(decision.toolCalls, page, memory, stepNumber);
813
+ // If needs tool results, call agent again
814
+ if (decision.needsToolResults) {
815
+ const updatedContext = { ...context, toolResults };
816
+ const continuedDecision = await this.callExploratoryAgent(updatedContext, jobId, stepNumber);
817
+ decision.commands = continuedDecision.commands || decision.commands;
818
+ decision.commandReasoning = continuedDecision.commandReasoning || decision.commandReasoning;
819
+ decision.status = continuedDecision.status;
820
+ }
821
+ }
822
+ // Handle blocker clearing
823
+ if (decision.blockerDetected && decision.blockerDetected.clearingCommands) {
824
+ this.logger?.(`[Orchestrator] 🚧 Clearing blocker: ${decision.blockerDetected.description}`);
825
+ const blockerResult = await this.executeCommands(decision.blockerDetected.clearingCommands, page, memory, stepNumber, 1, jobId);
826
+ commandsExecuted.push(...blockerResult.executed);
827
+ }
828
+ // Execute exploration commands
829
+ let commandsSucceeded = true;
830
+ if (decision.commands && decision.commands.length > 0) {
831
+ const executeResult = await this.executeCommands(decision.commands, page, memory, stepNumber, 1, jobId);
832
+ commandsExecuted.push(...executeResult.executed);
833
+ commandsSucceeded = executeResult.allSucceeded;
834
+ }
835
+ // Report step completion (fires JourneyRunner's onStepComplete callback)
836
+ if (this.progressReporter?.onStepProgress) {
837
+ const stepInfo = {
838
+ jobId,
839
+ stepNumber,
840
+ stepId: `exploration-${stepNumber}-${Date.now()}`,
841
+ description: decision.reasoning,
842
+ code: decision.commands?.join('\n') || '',
843
+ status: commandsSucceeded ? progress_reporter_1.StepExecutionStatus.SUCCESS : progress_reporter_1.StepExecutionStatus.FAILURE,
844
+ error: commandsSucceeded ? undefined : 'Command execution failed',
845
+ wasRepaired: false
846
+ };
847
+ await this.progressReporter.onStepProgress(stepInfo);
848
+ }
849
+ // Add experiences (both app patterns AND exploration progress)
850
+ if (decision.experiences) {
851
+ memory.experiences.push(...decision.experiences);
852
+ if (memory.experiences.length > this.config.maxExperiences) {
853
+ memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
854
+ }
855
+ }
856
+ // Store note for next iteration
857
+ if (decision.noteToFutureSelf) {
858
+ memory.latestNote = {
859
+ fromIteration: stepNumber,
860
+ content: decision.noteToFutureSelf
861
+ };
862
+ }
863
+ // Check termination
864
+ if (decision.status === 'complete') {
865
+ this.logger?.(`[Orchestrator] ✅ Journey exploration complete: ${decision.statusReasoning}`);
866
+ return {
867
+ success: true,
868
+ commands: commandsExecuted,
869
+ iterations: stepNumber,
870
+ terminationReason: 'complete',
871
+ memory
872
+ };
873
+ }
874
+ else if (decision.status === 'stuck') {
875
+ this.logger?.(`[Orchestrator] ❌ Exploration stuck: ${decision.statusReasoning}`);
876
+ return {
877
+ success: false,
878
+ commands: commandsExecuted,
879
+ iterations: stepNumber,
880
+ terminationReason: 'agent_stuck',
881
+ memory,
882
+ error: decision.statusReasoning
883
+ };
884
+ }
885
+ }
886
+ // Hit max steps - not necessarily a failure
887
+ this.logger?.(`[Orchestrator] ⚠ Maximum exploration steps reached (budget limit)`);
888
+ return {
889
+ success: true, // Not a failure - just budget limit
890
+ commands: commandsExecuted,
891
+ iterations: stepNumber,
892
+ terminationReason: 'system_limit',
893
+ memory
894
+ };
895
+ }
896
+ async buildExploratoryContext(page, explorationPrompt, testDataPrompt, memory, stepNumber, maxSteps) {
897
+ // Wait for page to be ready and elements to appear (especially important after navigation)
898
+ const currentPageInfo = await page_info_retry_1.PageInfoRetry.getWithRetry(page);
899
+ const currentURL = page.url();
900
+ const recentSteps = memory.history.slice(-this.config.recentStepsCount);
901
+ // SoM integration for exploratory mode
902
+ let somScreenshot = undefined;
903
+ let somElementMap = undefined;
904
+ if (this.config.useSoM && this.somHandler) {
905
+ try {
906
+ this.somHandler.setPage(page);
907
+ // Wait briefly for page stability (handles first iteration + safety net for fast SPAs)
908
+ try {
909
+ await page.waitForLoadState('domcontentloaded', { timeout: 2000 });
910
+ }
911
+ catch (error) {
912
+ // Page already loaded or timeout - continue
913
+ }
914
+ // Update SoM markers
915
+ await this.somHandler.updateSom();
916
+ somScreenshot = await this.somHandler.getScreenshot(true, false, 60); // Viewport only - agent can scroll or request full page
917
+ // Get element map for disambiguation
918
+ somElementMap = this.somHandler.getSomElementMap();
919
+ this.logger?.(`[Orchestrator] SoM screenshot captured for exploratory agent`, 'log');
920
+ }
921
+ catch (error) {
922
+ this.logger?.(`[Orchestrator] Failed to capture SoM screenshot: ${error.message}`, 'error');
923
+ }
924
+ }
925
+ const context = {
926
+ overallGoal: explorationPrompt,
927
+ currentStepGoal: explorationPrompt, // Same as overall for single journey
928
+ stepNumber,
929
+ totalSteps: maxSteps,
930
+ completedSteps: [],
931
+ remainingSteps: [],
932
+ currentPageInfo,
933
+ currentURL,
934
+ recentSteps,
935
+ experiences: memory.experiences,
936
+ extractedData: memory.extractedData,
937
+ noteFromPreviousIteration: memory.latestNote,
938
+ testDataPrompt, // CRITICAL: Store testDataPrompt in context
939
+ somScreenshot, // SoM screenshot for exploratory mode (current)
940
+ somElementMap // SoM element details for disambiguation
941
+ };
942
+ // Save current screenshot as previous for next iteration (for tool access)
943
+ if (somScreenshot) {
944
+ this.previousSomScreenshot = somScreenshot;
945
+ }
946
+ return context;
947
+ }
948
+ async callExploratoryAgent(context, jobId, stepNumber) {
949
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
950
+ // Use SoM system prompt if in SoM mode, otherwise use standard exploratory prompt
951
+ const systemPrompt = this.config.useSoM
952
+ ? orchestrator_prompts_1.OrchestratorPrompts.buildSomSystemPrompt(this.config.somRestrictCoordinates)
953
+ : orchestrator_prompts_1.OrchestratorPrompts.buildExploratorySystemPrompt(toolDescriptions);
954
+ const userPrompt = orchestrator_prompts_1.OrchestratorPrompts.buildExploratoryUserPrompt(context, context.overallGoal, context.testDataPrompt, // Pass testDataPrompt from context
955
+ stepNumber, context.totalSteps);
956
+ const llmRequest = {
957
+ model: model_constants_1.DEFAULT_MODEL,
958
+ systemPrompt,
959
+ userPrompt
960
+ };
961
+ // Include current SoM screenshot as image
962
+ if (context.somScreenshot) {
963
+ llmRequest.imageUrl = context.somScreenshot;
964
+ this.logger?.(`[Orchestrator] Including SoM screenshot in exploratory LLM request`, 'log');
965
+ }
966
+ const response = await this.llmFacade.llmProvider.callLLM(llmRequest);
967
+ // Report token usage
968
+ if (response.usage && this.progressReporter?.onTokensUsed) {
969
+ await this.progressReporter.onTokensUsed({
970
+ jobId,
971
+ stepNumber,
972
+ iteration: 1,
973
+ inputTokens: response.usage.inputTokens,
974
+ outputTokens: response.usage.outputTokens,
975
+ includesImage: false,
976
+ model: model_constants_1.DEFAULT_MODEL,
977
+ timestamp: Date.now()
978
+ });
979
+ }
980
+ // Parse response (same JSON format as regular mode)
981
+ const decision = this.decisionParser.parse(response.answer);
982
+ return decision;
983
+ }
853
984
  }
854
985
  exports.OrchestratorAgent = OrchestratorAgent;
855
986
  //# sourceMappingURL=orchestrator-agent.js.map