testchimp-runner-core 0.0.22 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. package/RELEASE_0.0.23.md +120 -0
  2. package/VISION_DIAGNOSTICS_IMPROVEMENTS.md +336 -0
  3. package/dist/credit-usage-service.d.ts +9 -0
  4. package/dist/credit-usage-service.d.ts.map +1 -1
  5. package/dist/credit-usage-service.js +20 -5
  6. package/dist/credit-usage-service.js.map +1 -1
  7. package/dist/env-loader.d.ts +5 -0
  8. package/dist/env-loader.d.ts.map +1 -1
  9. package/dist/env-loader.js +21 -0
  10. package/dist/env-loader.js.map +1 -1
  11. package/dist/execution-service.d.ts +7 -2
  12. package/dist/execution-service.d.ts.map +1 -1
  13. package/dist/execution-service.js +91 -36
  14. package/dist/execution-service.js.map +1 -1
  15. package/dist/index.d.ts +30 -2
  16. package/dist/index.d.ts.map +1 -1
  17. package/dist/index.js +91 -26
  18. package/dist/index.js.map +1 -1
  19. package/dist/llm-facade.d.ts +64 -8
  20. package/dist/llm-facade.d.ts.map +1 -1
  21. package/dist/llm-facade.js +361 -109
  22. package/dist/llm-facade.js.map +1 -1
  23. package/dist/llm-provider.d.ts +39 -0
  24. package/dist/llm-provider.d.ts.map +1 -0
  25. package/dist/llm-provider.js +7 -0
  26. package/dist/llm-provider.js.map +1 -0
  27. package/dist/model-constants.d.ts +21 -0
  28. package/dist/model-constants.d.ts.map +1 -0
  29. package/dist/model-constants.js +24 -0
  30. package/dist/model-constants.js.map +1 -0
  31. package/dist/orchestrator/index.d.ts +8 -0
  32. package/dist/orchestrator/index.d.ts.map +1 -0
  33. package/dist/orchestrator/index.js +23 -0
  34. package/dist/orchestrator/index.js.map +1 -0
  35. package/dist/orchestrator/orchestrator-agent.d.ts +66 -0
  36. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -0
  37. package/dist/orchestrator/orchestrator-agent.js +860 -0
  38. package/dist/orchestrator/orchestrator-agent.js.map +1 -0
  39. package/dist/orchestrator/tool-registry.d.ts +74 -0
  40. package/dist/orchestrator/tool-registry.d.ts.map +1 -0
  41. package/dist/orchestrator/tool-registry.js +131 -0
  42. package/dist/orchestrator/tool-registry.js.map +1 -0
  43. package/dist/orchestrator/tools/check-page-ready.d.ts +13 -0
  44. package/dist/orchestrator/tools/check-page-ready.d.ts.map +1 -0
  45. package/dist/orchestrator/tools/check-page-ready.js +72 -0
  46. package/dist/orchestrator/tools/check-page-ready.js.map +1 -0
  47. package/dist/orchestrator/tools/extract-data.d.ts +13 -0
  48. package/dist/orchestrator/tools/extract-data.d.ts.map +1 -0
  49. package/dist/orchestrator/tools/extract-data.js +84 -0
  50. package/dist/orchestrator/tools/extract-data.js.map +1 -0
  51. package/dist/orchestrator/tools/index.d.ts +10 -0
  52. package/dist/orchestrator/tools/index.d.ts.map +1 -0
  53. package/dist/orchestrator/tools/index.js +18 -0
  54. package/dist/orchestrator/tools/index.js.map +1 -0
  55. package/dist/orchestrator/tools/inspect-page.d.ts +13 -0
  56. package/dist/orchestrator/tools/inspect-page.d.ts.map +1 -0
  57. package/dist/orchestrator/tools/inspect-page.js +39 -0
  58. package/dist/orchestrator/tools/inspect-page.js.map +1 -0
  59. package/dist/orchestrator/tools/recall-history.d.ts +13 -0
  60. package/dist/orchestrator/tools/recall-history.d.ts.map +1 -0
  61. package/dist/orchestrator/tools/recall-history.js +64 -0
  62. package/dist/orchestrator/tools/recall-history.js.map +1 -0
  63. package/dist/orchestrator/tools/take-screenshot.d.ts +15 -0
  64. package/dist/orchestrator/tools/take-screenshot.d.ts.map +1 -0
  65. package/dist/orchestrator/tools/take-screenshot.js +112 -0
  66. package/dist/orchestrator/tools/take-screenshot.js.map +1 -0
  67. package/dist/orchestrator/types.d.ts +133 -0
  68. package/dist/orchestrator/types.d.ts.map +1 -0
  69. package/dist/orchestrator/types.js +28 -0
  70. package/dist/orchestrator/types.js.map +1 -0
  71. package/dist/playwright-mcp-service.d.ts +9 -0
  72. package/dist/playwright-mcp-service.d.ts.map +1 -1
  73. package/dist/playwright-mcp-service.js +20 -5
  74. package/dist/playwright-mcp-service.js.map +1 -1
  75. package/dist/progress-reporter.d.ts +97 -0
  76. package/dist/progress-reporter.d.ts.map +1 -0
  77. package/dist/progress-reporter.js +18 -0
  78. package/dist/progress-reporter.js.map +1 -0
  79. package/dist/prompts.d.ts +24 -0
  80. package/dist/prompts.d.ts.map +1 -1
  81. package/dist/prompts.js +593 -68
  82. package/dist/prompts.js.map +1 -1
  83. package/dist/providers/backend-proxy-llm-provider.d.ts +25 -0
  84. package/dist/providers/backend-proxy-llm-provider.d.ts.map +1 -0
  85. package/dist/providers/backend-proxy-llm-provider.js +76 -0
  86. package/dist/providers/backend-proxy-llm-provider.js.map +1 -0
  87. package/dist/providers/local-llm-provider.d.ts +21 -0
  88. package/dist/providers/local-llm-provider.d.ts.map +1 -0
  89. package/dist/providers/local-llm-provider.js +35 -0
  90. package/dist/providers/local-llm-provider.js.map +1 -0
  91. package/dist/scenario-service.d.ts +27 -1
  92. package/dist/scenario-service.d.ts.map +1 -1
  93. package/dist/scenario-service.js +48 -12
  94. package/dist/scenario-service.js.map +1 -1
  95. package/dist/scenario-worker-class.d.ts +39 -2
  96. package/dist/scenario-worker-class.d.ts.map +1 -1
  97. package/dist/scenario-worker-class.js +614 -86
  98. package/dist/scenario-worker-class.js.map +1 -1
  99. package/dist/script-utils.d.ts +2 -0
  100. package/dist/script-utils.d.ts.map +1 -1
  101. package/dist/script-utils.js +44 -4
  102. package/dist/script-utils.js.map +1 -1
  103. package/dist/types.d.ts +11 -0
  104. package/dist/types.d.ts.map +1 -1
  105. package/dist/types.js.map +1 -1
  106. package/dist/utils/browser-utils.d.ts +20 -1
  107. package/dist/utils/browser-utils.d.ts.map +1 -1
  108. package/dist/utils/browser-utils.js +102 -51
  109. package/dist/utils/browser-utils.js.map +1 -1
  110. package/dist/utils/page-info-utils.d.ts +23 -4
  111. package/dist/utils/page-info-utils.d.ts.map +1 -1
  112. package/dist/utils/page-info-utils.js +174 -43
  113. package/dist/utils/page-info-utils.js.map +1 -1
  114. package/package.json +1 -2
  115. package/plandocs/HUMAN_LIKE_IMPROVEMENTS.md +642 -0
  116. package/plandocs/MULTI_AGENT_ARCHITECTURE_REVIEW.md +844 -0
  117. package/plandocs/ORCHESTRATOR_MVP_SUMMARY.md +539 -0
  118. package/plandocs/PHASE1_ABSTRACTION_COMPLETE.md +241 -0
  119. package/plandocs/PHASE1_FINAL_STATUS.md +210 -0
  120. package/plandocs/PLANNING_SESSION_SUMMARY.md +372 -0
  121. package/plandocs/SCRIPT_CLEANUP_FEATURE.md +201 -0
  122. package/plandocs/SCRIPT_GENERATION_ARCHITECTURE.md +364 -0
  123. package/plandocs/SELECTOR_IMPROVEMENTS.md +139 -0
  124. package/src/credit-usage-service.ts +23 -5
  125. package/src/env-loader.ts +22 -0
  126. package/src/execution-service.ts +152 -42
  127. package/src/index.ts +169 -26
  128. package/src/llm-facade.ts +500 -126
  129. package/src/llm-provider.ts +43 -0
  130. package/src/model-constants.ts +23 -0
  131. package/src/orchestrator/index.ts +33 -0
  132. package/src/orchestrator/orchestrator-agent.ts +1042 -0
  133. package/src/orchestrator/tool-registry.ts +182 -0
  134. package/src/orchestrator/tools/check-page-ready.ts +75 -0
  135. package/src/orchestrator/tools/extract-data.ts +92 -0
  136. package/src/orchestrator/tools/index.ts +11 -0
  137. package/src/orchestrator/tools/inspect-page.ts +42 -0
  138. package/src/orchestrator/tools/recall-history.ts +72 -0
  139. package/src/orchestrator/tools/take-screenshot.ts +128 -0
  140. package/src/orchestrator/types.ts +200 -0
  141. package/src/playwright-mcp-service.ts +23 -5
  142. package/src/progress-reporter.ts +109 -0
  143. package/src/prompts.ts +606 -69
  144. package/src/providers/backend-proxy-llm-provider.ts +91 -0
  145. package/src/providers/local-llm-provider.ts +38 -0
  146. package/src/scenario-service.ts +83 -13
  147. package/src/scenario-worker-class.ts +740 -72
  148. package/src/script-utils.ts +50 -5
  149. package/src/types.ts +13 -1
  150. package/src/utils/browser-utils.ts +123 -51
  151. package/src/utils/page-info-utils.ts +210 -53
@@ -0,0 +1,1042 @@
1
+ /**
2
+ * Orchestrator Agent
3
+ * Single agent with tool-use capabilities that maintains journey memory and executes scenarios
4
+ */
5
+
6
+ import { expect } from '@playwright/test';
7
+ import { LLMFacade } from '../llm-facade';
8
+ import { ProgressReporter, TokenUsage, StepExecutionStatus } from '../progress-reporter';
9
+ import { getEnhancedPageInfo } from '../utils/page-info-utils';
10
+ import { ToolRegistry, ToolExecutionContext } from './tool-registry';
11
+ import { DEFAULT_MODEL } from '../model-constants';
12
+ import { isDevelopmentMode } from '../env-loader';
13
+ import {
14
+ AgentConfig,
15
+ AgentContext,
16
+ AgentDecision,
17
+ JourneyMemory,
18
+ MemoryStep,
19
+ OrchestratorStepResult,
20
+ SelfReflection,
21
+ DEFAULT_AGENT_CONFIG
22
+ } from './types';
23
+
24
+ /**
25
+ * Orchestrator Agent - manages step execution with tool use and memory
26
+ */
27
+ export class OrchestratorAgent {
28
+ private llmFacade: LLMFacade;
29
+ private toolRegistry: ToolRegistry;
30
+ private progressReporter?: ProgressReporter;
31
+ private config: Required<AgentConfig>;
32
+ private logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void;
33
+ private debugMode: boolean = false;
34
+
35
/**
 * Create an orchestrator agent.
 *
 * @param llmFacade - Facade whose `llmProvider` is used for agent LLM calls.
 * @param toolRegistry - Registry that supplies the tool descriptions embedded in the system prompt.
 * @param config - Optional overrides layered on top of `DEFAULT_AGENT_CONFIG`.
 *   Keys whose value is `undefined` are ignored so they cannot erase a default.
 * @param progressReporter - Optional sink for progress and token-usage callbacks.
 * @param logger - Optional logging callback; when omitted nothing is logged.
 * @param debugMode - Initial debug flag; defaults to `false`.
 */
constructor(
  llmFacade: LLMFacade,
  toolRegistry: ToolRegistry,
  config?: Partial<AgentConfig>,
  progressReporter?: ProgressReporter,
  logger?: (message: string, level?: 'log' | 'error' | 'warn' | 'debug') => void,
  debugMode?: boolean
) {
  this.llmFacade = llmFacade;
  this.toolRegistry = toolRegistry;

  // Spreading `{ key: undefined }` would overwrite the default with undefined
  // (e.g. an undefined maxIterationsPerStep makes the iteration loop never run),
  // so only explicitly provided values are allowed to override defaults.
  const overrides = Object.fromEntries(
    Object.entries(config ?? {}).filter(([, value]) => value !== undefined)
  ) as Partial<AgentConfig>;
  this.config = { ...DEFAULT_AGENT_CONFIG, ...overrides };

  this.progressReporter = progressReporter;
  this.logger = logger;
  this.debugMode = debugMode ?? false;
}
50
+
51
/**
 * Switch debug mode on or off for this agent instance.
 *
 * @param enabled - New value stored in the agent's `debugMode` flag.
 */
setDebugMode(enabled: boolean): void {
  this.debugMode = enabled;
}
54
+
55
/**
 * Execute a single step of the scenario.
 *
 * Runs an agent loop of at most `config.maxIterationsPerStep` iterations. Each
 * iteration builds a fresh context, asks the agent for a decision, optionally
 * runs the requested tools (and re-queries the agent when it asked to see the
 * tool results), clears a reported blocker, runs the main commands, records
 * experiences into `memory`, and finally checks the agent's status.
 *
 * @param page - Playwright page the commands and tools operate on.
 * @param stepDescription - Natural-language goal of this step.
 * @param stepNumber - 1-based index of this step within the scenario.
 * @param totalSteps - Total number of steps in the scenario.
 * @param scenarioSteps - All step descriptions of the scenario, in order.
 * @param memory - Journey memory; `experiences` is mutated in place.
 * @param jobId - Job identifier forwarded to progress and token reporting.
 * @returns The step outcome. `terminationReason` is one of `'complete'`,
 *   `'agent_stuck'`, `'infeasible'` or `'system_limit'`; `commands` lists the
 *   commands reported as executed (blocker-clearing commands are preceded by a
 *   `// Blocker: ...` comment line).
 */
async executeStep(
  page: any,
  stepDescription: string,
  stepNumber: number,
  totalSteps: number,
  scenarioSteps: string[],
  memory: JourneyMemory,
  jobId: string
): Promise<OrchestratorStepResult> {
  this.logger?.(`\n[Orchestrator] ========== STEP ${stepNumber}/${totalSteps} ==========`);
  this.logger?.(`[Orchestrator] 🎯 Goal: ${stepDescription}`);

  let iteration = 0;
  let previousReflection: SelfReflection | undefined = undefined;
  const commandsExecuted: string[] = [];
  // Number of consecutive iterations in which a command batch failed
  let consecutiveFailures = 0;

  while (iteration < this.config.maxIterationsPerStep) {
    iteration++;

    this.logger?.(`\n[Orchestrator] === Iteration ${iteration}/${this.config.maxIterationsPerStep} ===`);

    // Build context for agent (fresh page snapshot every iteration)
    const context = await this.buildAgentContext(
      page,
      stepDescription,
      stepNumber,
      totalSteps,
      scenarioSteps,
      memory,
      previousReflection,
      consecutiveFailures
    );

    // Call agent to make decision
    const decision = await this.callAgent(context, jobId, stepNumber, iteration, consecutiveFailures);

    // Log agent's reasoning
    this.logAgentDecision(decision, iteration);

    // Report progress
    await this.reportStepProgress(jobId, stepNumber, stepDescription, decision, iteration);

    // Execute tools if requested (tools are intended to be read-only information gathering)
    if (decision.toolCalls && decision.toolCalls.length > 0) {
      const toolResults: Record<string, any> = await this.executeTools(
        decision.toolCalls,
        page,
        memory,
        stepNumber
      );

      // If the agent wants to see tool results before acting, call it again
      if (decision.needsToolResults) {
        const updatedContext = { ...context, toolResults };
        const continuedDecision = await this.callAgent(
          updatedContext,
          jobId,
          stepNumber,
          iteration,
          consecutiveFailures
        );

        // Merge continued decision: commands fall back to the first answer,
        // status and reasoning always come from the follow-up answer
        decision.commands = continuedDecision.commands || decision.commands;
        decision.commandReasoning = continuedDecision.commandReasoning || decision.commandReasoning;
        decision.status = continuedDecision.status;
        decision.statusReasoning = continuedDecision.statusReasoning;
        decision.reasoning = continuedDecision.reasoning;
      }
    }

    // Tracks whether any command batch failed during this iteration
    let iterationHadFailure = false;

    // Handle blocker if detected (clear blocker FIRST, then proceed with main commands)
    const blocker = decision.blockerDetected;
    if (blocker && blocker.clearingCommands && blocker.clearingCommands.length > 0) {
      this.logger?.(`[Orchestrator] 🚧 BLOCKER DETECTED: ${blocker.description}`);
      this.logger?.(`[Orchestrator] 🧹 Clearing blocker with ${blocker.clearingCommands.length} command(s)...`);

      const blockerResult = await this.executeCommandsSequentially(
        blocker.clearingCommands,
        page,
        memory,
        stepNumber,
        iteration,
        jobId
      );

      // Add blocker commands with comment to output
      if (blockerResult.executed.length > 0) {
        commandsExecuted.push(`// Blocker: ${blocker.description}`);
        commandsExecuted.push(...blockerResult.executed);
      }

      // If blocker clearing failed, track it; main commands are skipped below
      if (!blockerResult.allSucceeded) {
        this.logger?.(`[Orchestrator] ❌ Failed to clear blocker - continuing anyway`);
        consecutiveFailures++;
        iterationHadFailure = true;
      }
    }

    // Execute main commands (only when blocker clearing did not fail)
    if (decision.commands && decision.commands.length > 0 && !iterationHadFailure) {
      const executeResult = await this.executeCommandsSequentially(
        decision.commands,
        page,
        memory,
        stepNumber,
        iteration,
        jobId
      );

      commandsExecuted.push(...executeResult.executed);

      // Track failures
      if (!executeResult.allSucceeded) {
        this.logger?.(`[Orchestrator] ⚠ Command execution stopped at failure`);
        consecutiveFailures++;
        iterationHadFailure = true;
      } else {
        consecutiveFailures = 0; // Reset on success
      }
    }

    // System-enforced stuck detection (agent might not detect it)
    if (consecutiveFailures >= 3) {
      this.logger?.(`[Orchestrator] 🛑 SYSTEM: ${consecutiveFailures} consecutive failures detected - forcing stuck`, 'warn');
      return {
        success: false,
        commands: commandsExecuted,
        iterations: iteration,
        terminationReason: 'agent_stuck',
        memory,
        error: `Failed ${consecutiveFailures} iterations in a row - unable to proceed`
      };
    }

    // Update memory with experiences
    if (decision.experiences && decision.experiences.length > 0) {
      for (const exp of decision.experiences) {
        // Skip blank or non-string entries: the substring-based dedup below
        // treats '' as contained in everything, so a stored empty experience
        // would suppress every future one, and a non-string would throw.
        if (typeof exp !== 'string' || exp.trim() === '') {
          continue;
        }

        // Deduplicate - don't add if very similar experience exists
        const candidate = exp.toLowerCase();
        const exists = memory.experiences.some(existing => {
          const known = existing.toLowerCase();
          return known.includes(candidate) || candidate.includes(known);
        });

        if (!exists) {
          memory.experiences.push(exp);
          this.logger?.(`[Orchestrator] 📚 Experience: ${exp}`);
        }
      }

      // Cap experiences, keeping the most recent ones
      if (memory.experiences.length > this.config.maxExperiences) {
        memory.experiences = memory.experiences.slice(-this.config.maxExperiences);
      }
    }

    // Store self-reflection for next iteration
    previousReflection = decision.selfReflection;

    // The agent chooses its status BEFORE its commands run. If it predicted
    // "complete" but a command failed in this iteration, the step cannot be
    // reported as successful - go around again so the agent can react.
    if (decision.status === 'complete' && iterationHadFailure) {
      this.logger?.(
        `[Orchestrator] ⚠ Agent reported "complete" but a command failed this iteration - retrying`,
        'warn'
      );
      continue;
    }

    // Check termination
    if (decision.status !== 'continue') {
      this.logger?.(`[Orchestrator] 🎯 Status: ${decision.status}`);
      this.logger?.(`[Orchestrator] 💭 Reason: ${decision.statusReasoning}`);

      return {
        success: decision.status === 'complete',
        commands: commandsExecuted,
        iterations: iteration,
        terminationReason: decision.status === 'complete' ? 'complete' :
                           decision.status === 'stuck' ? 'agent_stuck' :
                           'infeasible',
        memory
      };
    }
  }

  // Hit iteration limit
  this.logger?.(`[Orchestrator] ⚠ Maximum iterations reached (${this.config.maxIterationsPerStep})`);

  return {
    success: false,
    commands: commandsExecuted,
    iterations: iteration,
    terminationReason: 'system_limit',
    memory,
    error: 'Maximum iterations reached'
  };
}
241
+
242
/**
 * Assemble the context object handed to the agent for one iteration.
 *
 * Takes a fresh page snapshot via `getEnhancedPageInfo`, reads the current URL,
 * and combines them with the scenario plan and the journey memory.
 *
 * @param page - Playwright page to snapshot.
 * @param currentStepGoal - Goal text of the step being executed.
 * @param stepNumber - 1-based index of the current step.
 * @param totalSteps - Total number of scenario steps.
 * @param scenarioSteps - All step descriptions, in order.
 * @param memory - Journey memory supplying history, experiences and extracted data.
 * @param previousReflection - Self-reflection from the previous iteration, if any.
 * @param consecutiveFailures - Accepted for signature compatibility; not read here.
 * @returns The populated agent context.
 */
private async buildAgentContext(
  page: any,
  currentStepGoal: string,
  stepNumber: number,
  totalSteps: number,
  scenarioSteps: string[],
  memory: JourneyMemory,
  previousReflection?: SelfReflection,
  consecutiveFailures?: number
): Promise<AgentContext> {
  // Snapshot the DOM first, then read the URL, so both reflect the settled page
  const pageSnapshot = await getEnhancedPageInfo(page);
  const url = page.url();

  // Steps before the current one (stepNumber is 1-based)
  const finishedCount = stepNumber - 1;

  const agentContext: AgentContext = {
    overallGoal: scenarioSteps.join('\n'),
    currentStepGoal,
    stepNumber,
    totalSteps,
    completedSteps: scenarioSteps.slice(0, finishedCount),
    remainingSteps: scenarioSteps.slice(stepNumber),
    currentPageInfo: pageSnapshot,
    currentURL: url,
    // Only the most recent history entries are passed along
    recentSteps: memory.history.slice(-this.config.recentStepsCount),
    experiences: memory.experiences,
    extractedData: memory.extractedData,
    previousIterationGuidance: previousReflection
  };

  return agentContext;
}
278
+
279
/**
 * Ask the LLM for the agent's next decision.
 *
 * Builds the system and user prompts, calls the provider with `DEFAULT_MODEL`,
 * reports token usage to the progress reporter when usage data is present, and
 * parses the answer into an `AgentDecision`.
 *
 * Never throws: any failure while calling the LLM, reporting usage, or parsing
 * the answer is logged and converted into a fallback decision with status
 * `'stuck'`.
 *
 * @param context - Agent context for this iteration.
 * @param jobId - Job identifier attached to the token-usage report.
 * @param stepNumber - Step number attached to the token-usage report.
 * @param iteration - Iteration number attached to the token-usage report.
 * @param consecutiveFailures - Forwarded to the user-prompt builder.
 * @returns The parsed decision, or a `'stuck'` fallback on failure.
 */
private async callAgent(
  context: AgentContext,
  jobId: string,
  stepNumber: number,
  iteration: number,
  consecutiveFailures?: number
): Promise<AgentDecision> {
  // Build prompt
  const systemPrompt = this.buildSystemPrompt();
  const userPrompt = this.buildUserPrompt(context, consecutiveFailures);

  try {
    // Call LLM directly via provider
    const llmRequest = {
      model: DEFAULT_MODEL,
      systemPrompt,
      userPrompt
    };

    const response = await this.llmFacade.llmProvider.callLLM(llmRequest);

    // Report token usage
    if (response.usage && this.progressReporter?.onTokensUsed) {
      const tokenUsage: TokenUsage = {
        jobId,
        stepNumber,
        iteration,
        inputTokens: response.usage.inputTokens,
        outputTokens: response.usage.outputTokens,
        includesImage: false,
        model: DEFAULT_MODEL,
        timestamp: Date.now()
      };
      // Only log token usage in development mode (not in staging/prod)
      if (isDevelopmentMode()) {
        this.logger?.(`[Orchestrator] 💰 Reporting token usage: ${tokenUsage.inputTokens} + ${tokenUsage.outputTokens}`, 'log');
      }
      await this.progressReporter.onTokensUsed(tokenUsage);
    } else if (!response.usage && isDevelopmentMode()) {
      // Only log missing usage data in development mode
      this.logger?.(`[Orchestrator] ⚠ No usage data in LLM response`, 'warn');
    }

    // Parse response
    return this.parseAgentDecision(response.answer);
  } catch (error: unknown) {
    // Anything can be thrown, not just Error instances; avoid "undefined" in logs
    const message = error instanceof Error ? error.message : String(error);
    this.logger?.(`[Orchestrator] ✗ Agent call failed: ${message}`, 'error');

    // Return fallback decision
    return {
      status: 'stuck',
      statusReasoning: `Agent call failed: ${message}`,
      reasoning: 'LLM call failed'
    };
  }
}
339
+
340
+ /**
341
+ * Build system prompt with tool descriptions
342
+ */
343
+ private buildSystemPrompt(): string {
344
+ const toolDescriptions = this.toolRegistry.generateToolDescriptions();
345
+
346
+ return `You are an intelligent test automation agent that executes web scenarios using Playwright.
347
+
348
+ ${toolDescriptions}
349
+
350
+ YOUR RESPONSE FORMAT (JSON):
351
+ {
352
+ "toolCalls": [{"name": "tool_name", "params": {...}}], // Optional: Tools to use
353
+ "toolReasoning": "Why I need these tools", // If using tools
354
+ "needsToolResults": true/false, // Wait for tool results before commands?
355
+ "commands": ["await page.click(...)", ...], // Playwright commands to execute
356
+ "commandReasoning": "Why these commands", // If executing commands
357
+ "selfReflection": { // Guidance for your next iteration
358
+ "guidanceForNext": "What to focus on next",
359
+ "detectingLoop": false, // Set true if repeating same approach
360
+ "loopReasoning": "Why I think I'm looping" // If detecting loop
361
+ },
362
+ "experiences": ["App-specific learning"], // APP-SPECIFIC patterns (see rules below)
363
+ "status": "continue|complete|stuck|infeasible", // Your decision
364
+ "statusReasoning": "Why this status",
365
+ "reasoning": "Overall reasoning for this iteration",
366
+ "blockerDetected": { // OPTIONAL: If something blocks the current goal
367
+ "description": "What's blocking", // E.g., "Cookie consent modal"
368
+ "clearingCommands": ["await page.click(...)"] // Commands to clear blocker FIRST
369
+ },
370
+ "stepReEvaluation": { // OPTIONAL: If questioning step order (last resort)
371
+ "detected": true, // Set true if you think step order is wrong
372
+ "issue": "prior_incomplete|already_done|wrong_order", // What's the issue?
373
+ "explanation": "Why step order seems wrong" // Your reasoning
374
+ }
375
+ }
376
+
377
+ STATUS DECISION RULES (CRITICAL!):
378
+
379
+ - "complete": ALL commands succeeded AND step goal achieved
380
+ ❌ NEVER mark complete if ANY command failed!
381
+
382
+ - "continue": Command failed OR need more actions
383
+ Use when: Need to retry with different selector, wait for something, or continue execution
384
+
385
+ - "stuck": Tried 3+ iterations, all failed
386
+
387
+ - "infeasible": Goal truly impossible (element doesn't exist anywhere)
388
+
389
+ IF COMMAND FAILED → status must be "continue" (to retry) or "stuck" (if tried many times)
390
+ IF COMMAND SUCCEEDED → status can be "complete" (if goal done) or "continue" (if more needed)
391
+
392
+ STEP RE-EVALUATION (LAST RESORT - After 2+ Failures):
393
+
394
+ When you've failed multiple times on current step, QUESTION YOUR ASSUMPTIONS:
395
+
396
+ 1. **LOOK BACK**: Did prior steps really complete?
397
+ - Check COMPLETED steps list vs current page state
398
+ - Example: Says "Login complete" but you see login form → PRIOR STEP FAILED
399
+ - If prior step incomplete: Execute commands to complete it, explain in reasoning
400
+
401
+ 2. **LOOK FORWARD**: Is current step already done or no longer relevant?
402
+ - Check if current step goal already achieved (element exists, state correct)
403
+ - Example: Step says "Click Submit" but form already submitted → STEP ALREADY DONE
404
+ - If already done: Mark "complete" and explain why
405
+
406
+ 3. **LOOK AHEAD**: Is next step more feasible?
407
+ - Sometimes UI flows differently than expected
408
+ - Example: Current step "Click Edit" fails, but next step "Fill Name" elements are visible
409
+ - If can do future step: Note in reasoning, but PREFER sticking to order
410
+
411
+ STICK TO ORIGINAL PLAN unless clear evidence suggests otherwise.
412
+ Only re-evaluate after repeated failures on same step.
413
+
414
+ BLOCKER DETECTION (IMPORTANT!):
415
+
416
+ Use "blockerDetected" when the CURRENT STEP GOAL is not immediately feasible because something unexpected is in the way:
417
+
418
+ Examples of BLOCKERS:
419
+ - Cookie consent modal appears before login
420
+ - "What's New" tour overlay prevents clicking navigation
421
+ - Email verification prompt blocks form submission
422
+ - Session timeout modal requires re-login before continuing
423
+
424
+ When you detect a blocker:
425
+ 1. Set "blockerDetected.description" to describe what's blocking
426
+ 2. Provide "clearingCommands" to dismiss/handle the blocker
427
+ 3. Set your main "commands" to execute AFTER blocker is cleared
428
+ 4. Agent will execute: clearingCommands FIRST, then your commands
429
+
430
+ Example:
431
+ {
432
+ "blockerDetected": {
433
+ "description": "Cookie consent modal blocking page interaction",
434
+ "clearingCommands": ["await page.getByRole('button', {name: 'Accept'}).click()"]
435
+ },
436
+ "commands": ["await page.getByRole('textbox', {name: 'Email'}).fill('user@example.com')"],
437
+ ...
438
+ }
439
+
440
+ DON'T use blockerDetected for:
441
+ - Normal step execution (just use commands)
442
+ - Wrong selectors (that's not a blocker, just fix the selector)
443
+ - Missing elements (that's infeasible, not blocked)
444
+
445
+ EXPERIENCES - APP-SPECIFIC PATTERNS ONLY:
446
+
447
+ Guidelines:
448
+ - Concise - combine related info into single learning
449
+ - Focus on app quirks, not obvious practices
450
+ - Max 1-2 experiences per iteration
451
+ - Capture SEMANTIC SELECTOR patterns when discovered
452
+
453
+ ✅ GOOD - App-specific patterns:
454
+ - "Login form fields accessible via getByLabel: 'Email' and 'Password'"
455
+ - "Submit buttons consistently use role=button with text matching action"
456
+ - "Form uses custom dropdowns (role=combobox) instead of native selects"
457
+ - "Navigation items accessible via getByRole('link') with visible text"
458
+ - "Site uses data-testid attributes consistently (pattern: action-element-name)"
459
+ - "Input fields have clear placeholders - prefer getByPlaceholder over IDs"
460
+
461
+ ❌ BAD - Too specific or obvious:
462
+ - Listing individual field selectors (combine them)
463
+ - "Button exists with role=button" (obvious)
464
+ - Generic automation advice
465
+ - Noting auto-generated IDs like #«r3»-form-item (these are unreliable)
466
+
467
+ COMBINE related learnings instead of listing each element separately.
468
+ PRIORITIZE learning about semantic selector availability over CSS selectors.
469
+
470
+ CRITICAL RULES FOR DECISION MAKING:
471
+
472
+ 1. **BE FOCUSED AND CONCISE** (CRITICAL!):
473
+ - Read the CURRENT STEP GOAL carefully - ONLY do what's explicitly asked for in THAT step
474
+ - DON'T add extra actions, verifications, or defensive checks not mentioned in the step goal
475
+ - If step says "Send a message" → ONLY send the message, DON'T verify it appeared
476
+ - If step says "Click Submit" → ONLY click, DON'T wait or verify submission success
477
+ - ONLY add expect() assertions if the step explicitly mentions "verify", "check", "ensure", or "confirm"
478
+ - Keep commands minimal - no unnecessary preparatory or validation actions
479
+
480
+ 2. **COMMAND EFFICIENCY**:
481
+ - DON'T click on input field before filling it - just use .fill() directly
482
+ - DON'T do preparatory actions unless necessary
483
+ - Example: page.fill works without clicking the input first
484
+
485
+ 2b. **VARIABLE SCOPE & AVAILABLE FUNCTIONS** (CRITICAL!):
486
+ - Commands in SAME BATCH share scope - variables persist within iteration
487
+ - To save data for later steps: Use extract_data tool, then access via extractedData in context
488
+ - Within same batch: Variables work across commands
489
+ - AVAILABLE IN CONTEXT: page (Playwright Page object) and expect (Playwright assertions)
490
+ - You can use expect() directly - it is already imported and available
491
+
492
+ 3. **WHEN COMMAND FAILS** (most important!):
493
+ - DON'T repeat the same command/selector
494
+ - DO analyze the error message
495
+ - DO look at DOM snapshot for SEMANTIC selectors (getByRole, getByLabel, getByText)
496
+ - AVOID falling back to auto-generated IDs with unicode characters (e.g., #«r3»-form-item)
497
+ - DO take screenshot if DOM doesn't help
498
+ - "Element is not a <select>" → It's a custom dropdown, use .click() then select from menu
499
+ - "Timeout waiting for locator" → Try semantic selectors from ARIA tree instead of CSS IDs
500
+
501
+ 4. **CHECK YOUR RECENT STEPS FOR PATTERNS**:
502
+ - If you see same selector failed 2+ times → YOU ARE LOOPING!
503
+ - Set detectingLoop: true and explain why
504
+ - Take screenshot to see what's actually on the page
505
+ - Use different SEMANTIC selector from DOM/ARIA tree (getByRole, getByLabel, getByText, data-testid)
506
+ - AVOID repeating with similar auto-generated ID patterns (e.g., #«r3»-form-item → #«r4»-form-item)
507
+
508
+ 5. **USE DOM SNAPSHOT** - ONLY USE WHAT'S ACTUALLY THERE (CRITICAL!):
509
+ - You get INTERACTIVE ELEMENTS list showing: position, tag, id, text, SUGGESTED SELECTORS
510
+ - ONLY use selectors from this list - DO NOT invent selectors with text/names not in the list!
511
+ - ❌ WRONG: List shows button "Submit" but you use getByRole('button', {name: 'SomeOtherText'})
512
+ - ✅ RIGHT: Use EXACT selector from the provided list
513
+ - ✅ RIGHT: If list shows "#submit-btn" → use that
514
+ - ✅ RIGHT: If list shows "getByRole('button', {name: 'X'})" → use exactly that
515
+ - If element you need is NOT in list → take_screenshot to find it
516
+ - ARIA tree shows roles/names that exist - inventing roles/names not in tree will fail!
517
+
518
+ 5b. **SELECTOR PREFERENCE** (CRITICAL - Playwright Best Practices):
519
+ ALWAYS prefer user-friendly, semantic selectors in this order:
520
+
521
+ 🏆 BEST (use these first):
522
+ 1. page.getByRole('role', {name: 'text'}) → ✅ Accessible, semantic, resilient
523
+ 2. page.getByLabel('label text') → ✅ Great for form inputs
524
+ 3. page.getByPlaceholder('placeholder') → ✅ Good for inputs without labels
525
+ 4. page.getByText('visible text') → ✅ Clear and readable
526
+ 5. page.getByTestId('test-id') → ✅ Stable if available
527
+
528
+ ⚠️ AVOID (last resort only):
529
+ - CSS selectors with auto-generated IDs: #r3-form-item, #«r3»-form-item
530
+ - CSS selectors with unicode: #«r4»-form-item
531
+ - Complex CSS paths: div.ant-form > div:nth-child(2)
532
+
533
+ Examples:
534
+ ❌ BAD: await page.fill('#«r3»-form-item', 'alice@example.com')
535
+ ✅ GOOD: await page.getByLabel('Email').fill('alice@example.com')
536
+ ✅ GOOD: await page.getByRole('textbox', {name: 'Email'}).fill('alice@example.com')
537
+ ✅ GOOD: await page.getByPlaceholder('Enter your email').fill('alice@example.com')
538
+
539
+ ❌ BAD: await page.click('#submit-btn-23847')
540
+ ✅ GOOD: await page.getByRole('button', {name: 'Submit'}).click()
541
+ ✅ GOOD: await page.getByText('Submit').click()
542
+
543
+ WHY: Semantic selectors are more maintainable, self-documenting, and resilient to UI changes.
544
+ Auto-generated IDs (especially with unicode like «r3») break when component instances change.
545
+
546
+ 5c. **ASSERTIONS AND VERIFICATION** (CRITICAL - Use expect):
547
+ Use expect() for verification ONLY when the step explicitly asks for verification.
548
+
549
+ 🏆 WHEN TO USE expect():
550
+ - Step says "Verify message appears" → await expect(locator).toBeVisible()
551
+ - Step says "Check field is empty" → await expect(locator).toBeEmpty()
552
+ - Step says "Confirm URL changed" → await expect(page).toHaveURL('...')
553
+ - Step explicitly mentions: "verify", "check", "ensure", "confirm", "validate"
554
+
555
+ ❌ WHEN NOT TO USE expect():
556
+ - Step says "Send a message" → DON'T add expect() for message visibility
557
+ - Step says "Click Submit" → DON'T add expect() for button state or result
558
+ - Step says "Fill Email field" → DON'T add expect() to verify it's filled
559
+ - Step says "Navigate to Dashboard" → DON'T add expect() for URL (unless step asks)
560
+
561
+ 🏆 expect() syntax (when needed):
562
+ - await expect(locator).toBeVisible()
563
+ - await expect(locator).toHaveText('expected text')
564
+ - await expect(locator).toHaveValue('expected value')
565
+ - await expect(locator).toBeEmpty()
566
+ - await expect(page).toHaveURL('expected-url')
567
+
568
+ ❌ BAD - Manual checks:
569
+ const value = await page.getByPlaceholder('Message...').inputValue();
570
+ if (value !== '') throw new Error(\`Expected empty but found: '\${value}'\`);
571
+
572
+ ✅ GOOD - Use expect (but ONLY if step asks to verify):
573
+ await expect(page.getByPlaceholder('Message...')).toBeEmpty();
574
+
575
+ WHY: expect() has built-in retries and better error messages.
576
+ BUT: Only use it when verification is explicitly requested in the step goal.
577
+
578
+ 6. **TOOLS ARE READ-ONLY (Information Gathering)**:
579
+ AVAILABLE TOOLS (for information only):
580
+ - take_screenshot: Get visual context when DOM isn't enough or stuck 2+ times
581
+ * DEFAULT: Captures FULL PAGE (elements below fold included)
582
+ * Set isFullPage=false ONLY if you need just viewport
583
+ * Use for: Planning, finding hidden elements, verifying layout
584
+ - recall_history: Remember earlier steps in journey
585
+ - inspect_page: Force fresh DOM snapshot (rarely needed, already in context)
586
+ - check_page_ready: Verify page loaded (use sparingly!)
587
+ - extract_data: Save data from page to memory for later steps
588
+
589
+ ALL STATE CHANGES USE PLAYWRIGHT COMMANDS:
590
+ - Navigation: await page.goto('url')
591
+ - Go back: await page.goBack()
592
+ - Refresh: await page.reload()
593
+ - Fill inputs: await page.fill() or page.getByRole().fill()
594
+ - Click buttons: await page.click() or page.getByRole().click()
595
+ - Wait: await page.waitForURL(), page.waitForLoadState()
596
+
597
+ Tools gather info → Commands change state
598
+
599
+ 7. **ERROR-SPECIFIC RESPONSES**:
600
+ - "Timeout waiting for locator" → Selector doesn't exist, try different one from DOM
601
+ - "Element is not a <select>" → It's a custom dropdown (button with role=combobox), use .click()
602
+ - "Multiple elements" → Add more specific selector from DOM (use #id)
603
+
604
+ 8. **WHEN TO RUN COMMANDS vs TOOLS**:
605
+ - Confident about selectors from DOM → Run commands directly
606
+ - Unsure or failed 2+ times → Take screenshot first
607
+ - First iteration of a step → Usually can run commands from DOM`;
608
+ }
609
+
610
/**
 * Build the user prompt sent to the LLM for one agent iteration.
 *
 * The prompt is assembled as a list of lines (joined with '\n') in this order:
 * optional failure warning, current step goal, overall scenario, completed/remaining
 * steps, current page state (URL, title, interactive elements, ARIA tree), recent
 * step history, learnings, extracted data, previous self-guidance, and tool results.
 *
 * @param context - Snapshot of everything the agent knows for this iteration.
 * @param consecutiveFailures - Number of failed iterations in a row; a warning
 *   banner is added when this is 2 or more.
 * @returns The complete prompt text.
 */
private buildUserPrompt(context: AgentContext, consecutiveFailures?: number): string {
  // Upper bound on how much of the pretty-printed ARIA tree is put in the prompt.
  const ARIA_TREE_MAX_CHARS = 5000;

  const parts: string[] = [];

  parts.push('=== YOUR CURRENT CONTEXT ===\n');

  // System warnings for accumulated failures
  if (consecutiveFailures && consecutiveFailures >= 2) {
    parts.push(`⚠️⚠️⚠️ SYSTEM WARNING ⚠️⚠️⚠️`);
    parts.push(`You've failed ${consecutiveFailures} iterations in a row!`);
    parts.push(`Next failure will force STUCK. You MUST:`);
    parts.push(`1. Take screenshot if you haven't already`);
    parts.push(`2. Try completely different selector strategy`);
    parts.push(`3. QUESTION ASSUMPTIONS: Am I at the right step?`);
    parts.push(` - LOOK BACK: Did prior steps really complete? Check page state vs COMPLETED list`);
    parts.push(` - LOOK FORWARD: Is current step already done? Check if goal already achieved`);
    parts.push(`4. OR mark status="stuck" if truly can't proceed`);
    parts.push(`⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️\n`);
  }

  // Goals - make current step very prominent
  parts.push(`🎯 CURRENT STEP GOAL (${context.stepNumber}/${context.totalSteps}):`);
  parts.push(`${context.currentStepGoal}`);
  parts.push(`^^ DO ONLY WHAT THIS STEP ASKS - NO EXTRA ACTIONS OR VERIFICATIONS ^^`);
  parts.push(`^^ If step doesn't say "verify/check/confirm" → DON'T add expect() assertions ^^`);
  parts.push(`^^ Mark "complete" ONLY if commands succeeded ^^`);
  parts.push(``);
  parts.push(`OVERALL SCENARIO: ${context.overallGoal}\n`);

  if (context.completedSteps.length > 0) {
    parts.push(`COMPLETED: ${context.completedSteps.join(', ')}`);
  }
  if (context.remainingSteps.length > 0) {
    parts.push(`REMAINING: ${context.remainingSteps.join(', ')}\n`);
  }

  // Current page state
  parts.push(`\nCURRENT PAGE:`);
  parts.push(`URL: ${context.currentURL}`);
  parts.push(`Title: ${context.currentPageInfo.title}`);
  parts.push(`\nINTERACTIVE ELEMENTS (with positions and selectors):`);
  parts.push(context.currentPageInfo.formattedElements);
  parts.push(`\nARIA TREE (hierarchical structure):`);
  // Serialize once and decide truncation from the SAME string that is cut.
  // JSON.stringify returns undefined for an undefined snapshot, hence the fallback.
  const ariaTree = JSON.stringify(context.currentPageInfo.ariaSnapshot, null, 2) ?? '';
  parts.push(ariaTree.substring(0, ARIA_TREE_MAX_CHARS));
  if (ariaTree.length > ARIA_TREE_MAX_CHARS) {
    parts.push('... (truncated)');
  }
  parts.push('');

  // Recent steps
  if (context.recentSteps.length > 0) {
    parts.push(`\nRECENT STEPS (last ${context.recentSteps.length}):`);
    for (const step of context.recentSteps) {
      const status = step.result === 'success' ? '✓' : '✗';
      parts.push(` ${status} ${step.stepNumber}.${step.iteration || ''} ${step.action}`);
      parts.push(` Code: ${step.code}`);
      if (step.result === 'failure' && step.error) {
        parts.push(` ❌ ERROR: ${step.error}`);
        parts.push(` ^^ THIS SELECTOR FAILED - TRY DIFFERENT APPROACH ^^`);
      } else {
        parts.push(` Result: ${step.observation}`);
      }
    }
    parts.push('');

    // Detect repeated failures: warn when the last two failed entries ran identical code
    const recentFailures = context.recentSteps.filter(s => s.result === 'failure');
    if (recentFailures.length >= 2) {
      const sameSelector = recentFailures.slice(-2).every((s, i, arr) =>
        i === 0 || s.code === arr[i - 1].code
      );
      if (sameSelector) {
        parts.push(`⚠️ WARNING: You've tried the same selector multiple times and it failed!`);
        parts.push(` Last failed selector: ${recentFailures[recentFailures.length - 1].code}`);
        parts.push(` YOU MUST try a completely different selector this time!\n`);
      }
    }
  }

  // Experiences
  if (context.experiences.length > 0) {
    parts.push(`\nLEARNINGS (${context.experiences.length}):`);
    for (const exp of context.experiences) {
      parts.push(` - ${exp}`);
    }
    parts.push('');
  }

  // Extracted data
  if (Object.keys(context.extractedData).length > 0) {
    parts.push(`\nEXTRACTED DATA:`);
    for (const [key, value] of Object.entries(context.extractedData)) {
      parts.push(` ${key}: ${value}`);
    }
    parts.push('');
  }

  // Previous reflection
  if (context.previousIterationGuidance) {
    parts.push(`\nYOUR PREVIOUS GUIDANCE TO YOURSELF:`);
    parts.push(`Focus: ${context.previousIterationGuidance.guidanceForNext}`);
    if (context.previousIterationGuidance.detectingLoop) {
      parts.push(`⚠ YOU DETECTED A LOOP: ${context.previousIterationGuidance.loopReasoning}`);
    }
    parts.push('');
  }

  // Tool results (if any)
  if (context.toolResults) {
    parts.push(`\n========== TOOL RESULTS ==========`);
    for (const [toolName, result] of Object.entries(context.toolResults)) {
      parts.push(`\n${toolName}: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);

      if (result.learning) {
        parts.push(`\n📊 ANALYSIS FROM ${toolName.toUpperCase()}:`);
        parts.push(result.learning);
        parts.push(`^^ USE THIS INFORMATION TO DECIDE YOUR NEXT COMMANDS ^^`);
      }

      if (result.error) {
        parts.push(` Error: ${result.error}`);
      }
    }
    parts.push(`\n===================================\n`);
  }

  parts.push('=== WHAT DO YOU WANT TO DO? ===');
  parts.push('Respond with JSON following the format specified in the system prompt.');

  return parts.join('\n');
}
743
+
744
/**
 * Turn the raw LLM response text into an AgentDecision.
 *
 * The span from the first '{' to the last '}' (greedy match) is treated as the
 * JSON payload. The parsed object must carry truthy `status` and `reasoning`
 * fields. Any failure along the way is logged and converted into a 'stuck'
 * decision, so this method does not throw for malformed responses.
 *
 * @param response - Raw text returned by the LLM.
 * @returns The parsed decision, or a 'stuck' fallback describing the parse error.
 */
private parseAgentDecision(response: string): AgentDecision {
  let decision: AgentDecision;

  try {
    // Locate the JSON object embedded in the response
    const jsonCandidate = response.match(/\{[\s\S]*\}/);
    if (!jsonCandidate) {
      throw new Error('No JSON found in response');
    }

    const payload = JSON.parse(jsonCandidate[0]);

    // Both mandatory fields must be present and non-empty
    const hasRequiredFields = payload.status && payload.reasoning;
    if (!hasRequiredFields) {
      throw new Error('Missing required fields: status and reasoning');
    }

    decision = payload as AgentDecision;
  } catch (error: any) {
    const reason = error.message;
    this.logger?.(`[Orchestrator] ✗ Failed to parse agent decision: ${reason}`, 'error');
    this.logger?.(`[Orchestrator] Raw response: ${response.substring(0, 200)}...`, 'log');

    // Fallback decision so the caller can treat the iteration as stuck
    decision = {
      status: 'stuck',
      statusReasoning: 'Failed to parse agent response',
      reasoning: `Parse error: ${reason}`
    };
  }

  return decision;
}
776
+
777
/**
 * Execute the tool calls requested by the agent, one after another.
 *
 * At most `this.config.maxToolCallsPerIteration` calls are run; any extra calls
 * are silently dropped. Each call is delegated to `this.toolRegistry.execute`
 * and its outcome is logged.
 *
 * NOTE(review): results are keyed by tool name, so if the same tool is requested
 * twice in one iteration only the last result is kept. Confirm this is intended.
 *
 * @param toolCalls - Tool call descriptors (`name` and optional `params`).
 * @param page - Page object handed to the tools.
 * @param memory - Journey memory handed to the tools.
 * @param stepNumber - Step the tools are being run for.
 * @returns Map from tool name to that tool's result.
 */
private async executeTools(
  toolCalls: any[],
  page: any,
  memory: JourneyMemory,
  stepNumber: number
): Promise<Record<string, any>> {
  this.logger?.(`[Orchestrator] 🔧 Executing ${toolCalls.length} tool(s)`);

  const results: Record<string, any> = {};
  const toolContext: ToolExecutionContext = {
    page,
    memory,
    stepNumber,
    logger: this.logger
  };

  for (const toolCall of toolCalls.slice(0, this.config.maxToolCallsPerIteration)) {
    // JSON.stringify(undefined) yields undefined, which would make .substring throw
    // before the tool even runs; treat omitted params as an empty object for the log line.
    const paramsPreview = JSON.stringify(toolCall.params ?? {}).substring(0, 50);
    this.logger?.(`[Orchestrator] ▶ ${toolCall.name}(${paramsPreview}...)`);

    const result = await this.toolRegistry.execute(toolCall, toolContext);
    results[toolCall.name] = result;

    if (result.success) {
      this.logger?.(`[Orchestrator] ✓ ${toolCall.name} succeeded`);
    } else {
      this.logger?.(`[Orchestrator] ✗ ${toolCall.name} failed: ${result.error}`, 'error');
    }
  }

  return results;
}
811
+
812
/**
 * Execute commands sequentially inside ONE generated async function.
 *
 * At most `this.config.maxCommandsPerIteration` commands are run. Execution stops
 * at the first command that throws. Every command that completed before the
 * failure is recorded as a success in `memory.history` and returned in `executed`;
 * the failing command is recorded as a failure. History is capped to
 * `this.config.maxHistorySize` entries on both the success and the failure path.
 *
 * NOTE(review): each command is wrapped in its own `try { ... }` block, so only
 * `var` declarations are visible to later commands; `const`/`let` declared by one
 * command are block-scoped and NOT visible to the next one.
 *
 * SECURITY: commands are arbitrary code strings compiled with `new Function`.
 * They must only come from a trusted source.
 *
 * @param commands - Code strings to run; each may reference `page` and `expect`.
 * @param page - Page object exposed to the commands as `page`.
 * @param memory - Journey memory whose `history` receives one entry per recorded command.
 * @param stepNumber - Step number stored on each history entry.
 * @param iteration - Iteration number stored on each history entry.
 * @param jobId - Currently unused; kept for interface compatibility.
 * @returns The commands that ran successfully, and whether every command succeeded.
 */
private async executeCommandsSequentially(
  commands: string[],
  page: any,
  memory: JourneyMemory,
  stepNumber: number,
  iteration: number,
  jobId: string
): Promise<{ executed: string[]; allSucceeded: boolean }> {
  this.logger?.(`[Orchestrator] 📝 Executing ${commands.length} command(s) in shared context`);

  const executed: string[] = [];
  const limitedCommands = commands.slice(0, this.config.maxCommandsPerIteration);

  // Each command reports its own outcome. On failure the generated code RETURNS the
  // outcomes collected so far instead of rethrowing, so the host can tell exactly
  // which commands succeeded and which one failed.
  const commandsWithTracking = limitedCommands.map((cmd, i) => {
    return `
// Command ${i + 1}/${limitedCommands.length}
try {
${cmd}
__results.push({ index: ${i}, success: true });
} catch (error) {
__results.push({ index: ${i}, success: false, error: (error && error.message) || String(error) });
return __results; // Stop on first failure
}`;
  }).join('\n');

  const wrappedCode = `
const __results = [];
${commandsWithTracking}
return __results;
`;

  let outcomes: Array<{ index: number; success: boolean; error?: string }> = [];
  let launchError: string | undefined;
  try {
    // Pass both page and expect to make Playwright assertions available
    const func = new Function('page', 'expect', 'return (async () => { ' + wrappedCode + ' })()');
    const raw = await func(page, expect);
    if (Array.isArray(raw)) {
      outcomes = raw;
    }
  } catch (error: any) {
    // Only reached when the generated code could not run as a whole
    // (e.g. a syntax error in one of the commands); no per-command outcome exists.
    launchError = error?.message || String(error);
  }

  // Record every command that completed, in order
  for (const outcome of outcomes) {
    if (!outcome || !outcome.success) {
      break;
    }
    const cmd = limitedCommands[outcome.index];
    this.logger?.(`[Orchestrator] ✓ [${outcome.index + 1}/${limitedCommands.length}] Success`);

    memory.history.push({
      stepNumber,
      iteration,
      action: `Command ${outcome.index + 1}/${limitedCommands.length}`,
      code: cmd,
      result: 'success',
      observation: 'Executed successfully',
      url: page.url(),
      timestamp: Date.now()
    });

    executed.push(cmd);
  }

  const allSucceeded = executed.length === limitedCommands.length;

  if (!allSucceeded) {
    // Execution stops at the first failure, so the failing command is the one
    // right after the last recorded success.
    const failedIndex = executed.length;
    const failedOutcome = outcomes.find(o => o && !o.success);
    const errorMessage =
      failedOutcome?.error ?? launchError ?? 'Command did not report a result';

    // Capture page state for debug logging
    let pageStateDebug = '';
    if (this.debugMode) {
      try {
        const pageInfo = await getEnhancedPageInfo(page);
        pageStateDebug = `
=== DEBUG: PAGE STATE AT FAILURE ===
URL: ${page.url()}
Title: ${pageInfo.title}

INTERACTIVE ELEMENTS:
${pageInfo.formattedElements}

ARIA SNAPSHOT:
${JSON.stringify(pageInfo.ariaSnapshot, null, 2)}

====================================`;
      } catch (debugError: any) {
        pageStateDebug = `Failed to capture page state: ${debugError.message}`;
      }
    }

    this.logger?.(`[Orchestrator] ✗ [${failedIndex + 1}/${limitedCommands.length}] Failed: ${errorMessage}`, 'error');

    // Log detailed debug info
    if (this.debugMode && pageStateDebug) {
      this.logger?.(pageStateDebug, 'debug');
    }

    memory.history.push({
      stepNumber,
      iteration,
      action: `Command ${failedIndex + 1}/${limitedCommands.length} - FAILED`,
      code: limitedCommands[failedIndex],
      result: 'failure',
      observation: `Failed with error: ${errorMessage}. This selector likely doesn't exist or is incorrect.`,
      error: errorMessage,
      url: page.url(),
      timestamp: Date.now()
    });

    if (failedIndex < limitedCommands.length - 1) {
      this.logger?.(`[Orchestrator] ⚠ Skipping remaining ${limitedCommands.length - failedIndex - 1} command(s)`, 'warn');
    }
  }

  // Cap history (applies to both the success and the failure path)
  if (memory.history.length > this.config.maxHistorySize) {
    memory.history = memory.history.slice(-this.config.maxHistorySize);
  }

  return { executed, allSucceeded };
}
946
+
947
/**
 * Execute a single command string exactly once.
 *
 * The command is compiled into an async function that receives `page` and
 * `expect` as its only named inputs, then awaited. Errors thrown by the command
 * propagate to the caller.
 *
 * The previous implementation first ran the command through `eval` and, on ANY
 * error, ran it again through `new Function`. A command that failed after
 * partially acting on the page was therefore executed twice. It is now run once.
 *
 * SECURITY: `cmd` is arbitrary code compiled with `new Function`; it must only
 * come from a trusted source.
 *
 * @param cmd - Code to run; may reference `page` and `expect`.
 * @param page - Page object exposed to the command as `page`.
 */
private async executeCommand(cmd: string, page: any): Promise<void> {
  // Pass both page and expect to make Playwright assertions available
  const func = new Function('page', 'expect', `return (async () => { ${cmd} })()`);
  await func(page, expect);
}
963
+
964
/**
 * Write a human-readable summary of an agent decision to the logger.
 *
 * Logs, when present: the reasoning, self-reflection (with loop detection),
 * requested tools, detected blocker, step re-evaluation, and a preview of up to
 * three commands (each cut to 80 characters).
 *
 * @param decision - The decision to summarize.
 * @param iteration - Iteration number; not used in the log output.
 */
private logAgentDecision(decision: AgentDecision, iteration: number): void {
  const { selfReflection, toolCalls, blockerDetected, stepReEvaluation, commands } = decision;

  this.logger?.(`[Orchestrator] 💭 REASONING: ${decision.reasoning}`);

  // Self-reflection block
  if (selfReflection) {
    this.logger?.(`[Orchestrator] 🧠 SELF-REFLECTION:`);
    this.logger?.(`[Orchestrator] Next: ${selfReflection.guidanceForNext}`);
    if (selfReflection.detectingLoop) {
      this.logger?.(`[Orchestrator] 🔄 LOOP DETECTED: ${selfReflection.loopReasoning}`, 'warn');
    }
  }

  // Requested tools
  if (toolCalls && toolCalls.length > 0) {
    const toolNames = toolCalls.map(t => t.name).join(', ');
    this.logger?.(`[Orchestrator] 🔧 TOOLS: ${toolNames}`);
    if (decision.toolReasoning) {
      this.logger?.(`[Orchestrator] 📋 Why: ${decision.toolReasoning}`);
    }
  }

  // Blocker the agent wants to clear first
  if (blockerDetected) {
    this.logger?.(`[Orchestrator] 🚧 BLOCKER: ${blockerDetected.description}`, 'warn');
    this.logger?.(`[Orchestrator] 🧹 Clearing with ${blockerDetected.clearingCommands.length} command(s)`);
  }

  // Step re-evaluation
  if (stepReEvaluation?.detected) {
    this.logger?.(`[Orchestrator] 🔍 STEP RE-EVALUATION: ${stepReEvaluation.issue}`, 'warn');
    this.logger?.(`[Orchestrator] 📝 Explanation: ${stepReEvaluation.explanation}`);
  }

  // Command preview: first three only, then a count of the rest
  if (commands && commands.length > 0) {
    const previewLimit = 3;
    this.logger?.(`[Orchestrator] 📝 COMMANDS (${commands.length}):`);

    const shown = Math.min(commands.length, previewLimit);
    for (let n = 0; n < shown; n++) {
      this.logger?.(`[Orchestrator] ${n + 1}. ${commands[n].substring(0, 80)}...`);
    }

    if (commands.length > previewLimit) {
      this.logger?.(`[Orchestrator] ... and ${commands.length - previewLimit} more`);
    }
    if (decision.commandReasoning) {
      this.logger?.(`[Orchestrator] 💡 Why: ${decision.commandReasoning}`);
    }
  }

  // Experiences will be logged when added to memory, no need to log here
}
1010
+
1011
/**
 * Forward the outcome of one agent iteration to the progress reporter.
 *
 * Does nothing when no reporter, or no `onStepProgress` callback, is configured.
 * The agent status is mapped to a step status: 'complete' becomes SUCCESS,
 * 'stuck' and 'infeasible' become FAILURE, anything else is IN_PROGRESS.
 *
 * @param jobId - Job the step belongs to.
 * @param stepNumber - Step being reported.
 * @param description - Step description.
 * @param decision - Agent decision whose details are included in the report.
 * @param iteration - Agent iteration that produced the decision.
 */
private async reportStepProgress(
  jobId: string,
  stepNumber: number,
  description: string,
  decision: AgentDecision,
  iteration: number
): Promise<void> {
  const reporter = this.progressReporter;
  if (!reporter?.onStepProgress) {
    return;
  }

  // Translate the agent's status into the reporter's step status
  let status = StepExecutionStatus.IN_PROGRESS;
  if (decision.status === 'complete') {
    status = StepExecutionStatus.SUCCESS;
  } else if (decision.status === 'stuck' || decision.status === 'infeasible') {
    status = StepExecutionStatus.FAILURE;
  }

  const toolNames = decision.toolCalls?.map(t => t.name);
  const code = decision.commands?.join('\n');

  await reporter.onStepProgress({
    jobId,
    stepNumber,
    description,
    status,
    code,
    // Include agent metadata for transparency
    agentIteration: iteration,
    agentReasoning: decision.reasoning,
    agentSelfReflection: decision.selfReflection,
    agentExperiences: decision.experiences,
    agentToolsUsed: toolNames,
    agentStatus: decision.status
  });
}
1040
+ }
1041
+
1042
+