explorbot 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/README.md +27 -1
  2. package/bin/explorbot-cli.ts +27 -18
  3. package/dist/bin/explorbot-cli.js +26 -18
  4. package/dist/package.json +2 -2
  5. package/dist/rules/navigator/output.md +9 -0
  6. package/dist/rules/navigator/verification-actions.md +2 -0
  7. package/dist/src/action-result.js +23 -1
  8. package/dist/src/action.js +46 -38
  9. package/dist/src/ai/bosun.js +11 -1
  10. package/dist/src/ai/conversation.js +39 -0
  11. package/dist/src/ai/historian/codeceptjs.js +109 -0
  12. package/dist/src/ai/historian/experience.js +320 -0
  13. package/dist/src/ai/historian/mixin.js +2 -0
  14. package/dist/src/ai/historian/playwright.js +145 -0
  15. package/dist/src/ai/historian/utils.js +18 -0
  16. package/dist/src/ai/historian.js +19 -405
  17. package/dist/src/ai/navigator.js +82 -29
  18. package/dist/src/ai/pilot.js +232 -13
  19. package/dist/src/ai/planner.js +29 -9
  20. package/dist/src/ai/provider.js +54 -17
  21. package/dist/src/ai/researcher.js +41 -32
  22. package/dist/src/ai/rules.js +26 -14
  23. package/dist/src/ai/tester.js +90 -26
  24. package/dist/src/ai/tools.js +13 -7
  25. package/dist/src/browser-server.js +16 -3
  26. package/dist/src/commands/add-rule-command.js +11 -8
  27. package/dist/src/commands/clean-command.js +2 -1
  28. package/dist/src/commands/explore-command.js +27 -15
  29. package/dist/src/commands/init-command.js +9 -8
  30. package/dist/src/commands/plan-command.js +32 -0
  31. package/dist/src/commands/plan-save-command.js +19 -7
  32. package/dist/src/commands/rerun-command.js +4 -0
  33. package/dist/src/components/App.js +15 -5
  34. package/dist/src/execution-controller.js +13 -2
  35. package/dist/src/experience-tracker.js +20 -64
  36. package/dist/src/explorbot.js +5 -8
  37. package/dist/src/explorer.js +9 -2
  38. package/dist/src/observability.js +50 -99
  39. package/dist/src/playwright-recorder.js +309 -0
  40. package/dist/src/test-plan.js +12 -0
  41. package/dist/src/utils/aria.js +37 -1
  42. package/dist/src/utils/error-page.js +20 -7
  43. package/dist/src/utils/next-steps.js +37 -0
  44. package/package.json +2 -2
  45. package/rules/navigator/output.md +9 -0
  46. package/rules/navigator/verification-actions.md +2 -0
  47. package/src/action-result.ts +26 -1
  48. package/src/action.ts +44 -37
  49. package/src/ai/bosun.ts +11 -1
  50. package/src/ai/conversation.ts +37 -0
  51. package/src/ai/historian/codeceptjs.ts +130 -0
  52. package/src/ai/historian/experience.ts +383 -0
  53. package/src/ai/historian/mixin.ts +4 -0
  54. package/src/ai/historian/playwright.ts +169 -0
  55. package/src/ai/historian/utils.ts +23 -0
  56. package/src/ai/historian.ts +35 -473
  57. package/src/ai/navigator.ts +82 -29
  58. package/src/ai/pilot.ts +237 -14
  59. package/src/ai/planner.ts +29 -9
  60. package/src/ai/provider.ts +51 -17
  61. package/src/ai/researcher.ts +45 -33
  62. package/src/ai/rules.ts +27 -14
  63. package/src/ai/tester.ts +94 -26
  64. package/src/ai/tools.ts +47 -25
  65. package/src/browser-server.ts +17 -3
  66. package/src/commands/add-rule-command.ts +11 -7
  67. package/src/commands/clean-command.ts +2 -1
  68. package/src/commands/explore-command.ts +29 -15
  69. package/src/commands/init-command.ts +9 -8
  70. package/src/commands/plan-command.ts +35 -0
  71. package/src/commands/plan-save-command.ts +18 -7
  72. package/src/commands/rerun-command.ts +5 -0
  73. package/src/components/App.tsx +16 -5
  74. package/src/config.ts +6 -1
  75. package/src/execution-controller.ts +14 -3
  76. package/src/experience-tracker.ts +21 -72
  77. package/src/explorbot.ts +5 -8
  78. package/src/explorer.ts +11 -2
  79. package/src/observability.ts +50 -109
  80. package/src/playwright-recorder.ts +305 -0
  81. package/src/test-plan.ts +12 -0
  82. package/src/utils/aria.ts +38 -1
  83. package/src/utils/error-page.ts +22 -7
  84. package/src/utils/next-steps.ts +51 -0
@@ -10,12 +10,13 @@ import { HooksRunner } from '../utils/hooks-runner.ts';
10
10
  import { createDebug, pluralize, tag } from '../utils/logger.js';
11
11
  import { loop, pause } from '../utils/loop.js';
12
12
  import { RulesLoader } from '../utils/rules-loader.ts';
13
+ import { extractStatePath } from '../utils/url-matcher.js';
13
14
  import type { Agent } from './agent.js';
14
15
  import type { Conversation } from './conversation.js';
15
16
  import { ExperienceCompactor } from './experience-compactor.js';
16
17
  import type { Provider } from './provider.js';
17
18
  import { Researcher } from './researcher.ts';
18
- import { actionRule, locatorRule } from './rules.js';
19
+ import { actionRule, locatorRule, unexpectedPopupRule } from './rules.js';
19
20
  import { isInteractive } from './task-agent.js';
20
21
  import { createAgentTools } from './tools.ts';
21
22
 
@@ -153,11 +154,10 @@ class Navigator implements Agent {
153
154
  }
154
155
 
155
156
  if (!actionResult.isInsideIframe) {
156
- const toc = this.experienceTracker.getExperienceTableOfContents(actionResult);
157
- if (toc.length > 0) {
158
- const totalSections = toc.reduce((sum, entry) => sum + entry.sections.length, 0);
159
- tag('substep').log(`Found ${toc.length} experience ${pluralize(toc.length, 'file')} (${totalSections} sections) for: ${actionResult.url}`);
160
- experience = renderExperienceToc(toc);
157
+ const successful = this.experienceTracker.getSuccessfulExperience(actionResult);
158
+ if (successful.length > 0) {
159
+ tag('substep').log(`Found ${successful.length} experience ${pluralize(successful.length, 'file')} for: ${actionResult.url}`);
160
+ experience = `<experience>\nPast successful recipes recorded from prior runs for this page. Prefer these solutions first if they match the goal.\n\n${successful.join('\n\n')}\n</experience>`;
161
161
  }
162
162
  }
163
163
 
@@ -186,6 +186,8 @@ class Navigator implements Agent {
186
186
 
187
187
  ${actionRule}
188
188
 
189
+ ${unexpectedPopupRule}
190
+
189
191
  ${RulesLoader.loadRules('navigator', ['multiple-locator', 'output'], actionResult.url || '').replace('{{maxAttempts}}', String(this.MAX_ATTEMPTS))}
190
192
 
191
193
  ${experience}
@@ -196,12 +198,14 @@ class Navigator implements Agent {
196
198
  const conversation = this.provider.startConversation(this.systemPrompt, 'navigator');
197
199
  conversation.addUserText(prompt);
198
200
 
199
- const tools = this.buildExperienceTools();
201
+ const tools = undefined;
200
202
 
201
203
  let codeBlocks: string[] = [];
202
204
  let htmlContextAdded = false;
203
205
  let codeBlockIndex = 0;
204
206
  let totalAttempts = 0;
207
+ const progressBlocks: string[] = [];
208
+ const batchFailures: Array<{ code: string; error: string }> = [];
205
209
 
206
210
  let resolved = false;
207
211
  await loop(
@@ -223,22 +227,24 @@ class Navigator implements Agent {
223
227
 
224
228
  const codeBlock = codeBlocks[codeBlockIndex];
225
229
  if (!codeBlock) {
230
+ if (batchFailures.length === 0 && htmlContextAdded) {
231
+ stop();
232
+ return;
233
+ }
234
+ tag('substep').log('Feeding failures back to AI for a new batch...');
235
+ let contextMsg = 'Previous solutions did not work. Analyze the failures and try DIFFERENT strategies (not syntactic variants of the same locator).\n\n';
236
+ if (batchFailures.length > 0) {
237
+ const lines = batchFailures.map((f) => `- \`${f.code.split('\n')[0]}\` → ${f.error}`).join('\n');
238
+ contextMsg += `<previous_failures>\n${lines}\n</previous_failures>\n\n`;
239
+ }
226
240
  if (!htmlContextAdded) {
227
241
  htmlContextAdded = true;
228
- tag('substep').log('Adding HTML context for better resolution...');
229
- conversation.addUserText(dedent`
230
- Previous solutions did not work. Here is the full HTML context:
231
-
232
- <page_html>
233
- ${await actionResult.combinedHtml()}
234
- </page_html>
235
-
236
- Please suggest new solutions based on this additional context.
237
- `);
238
- codeBlocks = [];
239
- return;
242
+ contextMsg += `Full HTML context:\n\n<page_html>\n${await actionResult.combinedHtml()}\n</page_html>\n\n`;
240
243
  }
241
- stop();
244
+ contextMsg += 'Propose new solutions. If errors mention "intercepts pointer events" or timeouts on visible elements, an overlay is blocking — dismiss it first (Escape, click outside, Close button) before retrying the original action.';
245
+ conversation.addUserText(contextMsg);
246
+ codeBlocks = [];
247
+ batchFailures.length = 0;
242
248
  return;
243
249
  }
244
250
  codeBlockIndex++;
@@ -246,24 +252,69 @@ class Navigator implements Agent {
246
252
 
247
253
  await this.explorer.switchToMainFrame();
248
254
 
255
+ const prevHash = action.actionResult?.getStateHash() ?? actionResult.getStateHash();
256
+
249
257
  debugLog(`Attempting resolution: ${codeBlock}`);
250
- resolved = await action.attempt(codeBlock, message);
258
+ const attemptOk = await action.attempt(codeBlock, message);
259
+
260
+ const page = action.playwrightHelper?.page;
261
+ if (page) {
262
+ try {
263
+ await page.waitForLoadState('load', { timeout: 5000 });
264
+ } catch {
265
+ // Navigation did not reach 'load' state within timeout; continue and verify URL
266
+ }
267
+ }
268
+
269
+ if (!attemptOk) {
270
+ const raw = action.lastError?.message || 'attempt failed';
271
+ const firstMeaningful = raw.split('\n').find((l) => l.trim() && !l.trim().startsWith('at ')) || raw;
272
+ const shortErr = firstMeaningful.replace(/\s+/g, ' ').trim().slice(0, 220);
273
+ batchFailures.push({ code: codeBlock, error: shortErr });
274
+ }
251
275
 
252
276
  if (expectedUrl) {
253
- await (action.getActor() as any).wait(2);
277
+ if (page) {
278
+ try {
279
+ await page.waitForURL((url: URL) => normalizeUrl(url.pathname) === normalizeUrl(expectedUrl), { timeout: 5000 });
280
+ } catch {
281
+ // URL did not transition to expectedUrl within timeout
282
+ }
283
+ }
254
284
  const freshState = await action.capturePageState();
285
+ const urlMatches = normalizeUrl(freshState.url || '') === normalizeUrl(expectedUrl);
286
+ const stateChanged = freshState.getStateHash() !== actionResult.getStateHash();
287
+ resolved = urlMatches && stateChanged;
255
288
 
256
- if (normalizeUrl(freshState.url || '') === normalizeUrl(expectedUrl)) {
257
- resolved = true;
258
- } else if (resolved) {
289
+ if (!resolved && attemptOk) {
259
290
  tag('warning').log(`URL verification failed: expected ${expectedUrl}, got ${freshState.url}`);
260
- resolved = false;
261
291
  }
292
+ if (freshState.getStateHash() !== prevHash && (attemptOk || urlMatches)) {
293
+ progressBlocks.push(codeBlock);
294
+ }
295
+ } else {
296
+ resolved = attemptOk;
297
+ if (attemptOk) progressBlocks.push(codeBlock);
262
298
  }
263
299
 
264
300
  if (resolved) {
265
301
  tag('success').log('Navigation resolved successfully');
266
- this.experienceTracker.writeAction(actionResult, { title: message, code: codeBlock });
302
+ let scenario = message.split('\n')[0];
303
+ if (expectedUrl) {
304
+ const fromPath = extractStatePath(actionResult.url || '');
305
+ const toPath = extractStatePath(expectedUrl);
306
+ scenario = `reach ${toPath} from ${fromPath}`;
307
+ }
308
+ const recipe = progressBlocks
309
+ .join('\n')
310
+ .split('\n')
311
+ .filter((line) => !/^\s*I\.amOnPage\s*\(/.test(line))
312
+ .join('\n')
313
+ .trim();
314
+ if (recipe) {
315
+ const body = `## FLOW: ${scenario}\n\n* ${scenario}\n\n\`\`\`js\n${recipe}\n\`\`\`\n\n---\n`;
316
+ this.experienceTracker.writeFlow(actionResult, body);
317
+ }
267
318
  stop();
268
319
  return;
269
320
  }
@@ -455,7 +506,7 @@ class Navigator implements Agent {
455
506
  return suggestion;
456
507
  }
457
508
 
458
- async verifyState(message: string, actionResult: ActionResult): Promise<{ verified: boolean; successfulCodes: string[]; totalAttempted: number }> {
509
+ async verifyState(message: string, actionResult: ActionResult): Promise<{ verified: boolean; successfulCodes: string[]; assertionSteps: Array<{ name: string; args: any[] }>; totalAttempted: number }> {
459
510
  tag('info').log('AI Navigator verifying state at', actionResult.url);
460
511
  debugLog('Verification message:', message);
461
512
 
@@ -524,6 +575,7 @@ class Navigator implements Agent {
524
575
 
525
576
  let codeBlocks: string[] = [];
526
577
  const successfulCodes: string[] = [];
578
+ const assertionSteps: Array<{ name: string; args: any[] }> = [];
527
579
 
528
580
  const action = this.explorer.createAction();
529
581
 
@@ -555,6 +607,7 @@ class Navigator implements Agent {
555
607
  if (verified) {
556
608
  tag('success').log('Verification passed');
557
609
  successfulCodes.push(codeBlock);
610
+ assertionSteps.push(...action.assertionSteps);
558
611
  }
559
612
  },
560
613
  {
@@ -574,7 +627,7 @@ class Navigator implements Agent {
574
627
  actionResult.addVerification(message, verified);
575
628
  this.explorer.getStateManager().updateState(actionResult);
576
629
 
577
- return { verified, successfulCodes, totalAttempted };
630
+ return { verified, successfulCodes, assertionSteps, totalAttempted };
578
631
  }
579
632
  }
580
633
 
package/src/ai/pilot.ts CHANGED
@@ -7,6 +7,7 @@ import { type ExperienceTracker, renderExperienceToc } from '../experience-track
7
7
  import type Explorer from '../explorer.ts';
8
8
  import { type Test, TestResult } from '../test-plan.ts';
9
9
  import { collectInteractiveNodes, detectFocusArea, extractFocusedElement } from '../utils/aria.ts';
10
+ import { ErrorPageError } from '../utils/error-page.ts';
10
11
  import { createDebug, tag } from '../utils/logger.ts';
11
12
 
12
13
  const debugLog = createDebug('explorbot:pilot');
@@ -14,6 +15,7 @@ import { truncateJson } from '../utils/strings.ts';
14
15
  import type { Agent } from './agent.ts';
15
16
  import type { Conversation } from './conversation.ts';
16
17
  import type { Fisherman } from './fisherman.ts';
18
+ import type { Navigator } from './navigator.ts';
17
19
  import type { Provider } from './provider.ts';
18
20
  import type { Researcher } from './researcher.ts';
19
21
  import { isInteractive } from './task-agent.ts';
@@ -56,25 +58,30 @@ export class Pilot implements Agent {
56
58
  return this.conversation.getLastMessage() || null;
57
59
  }
58
60
 
59
- async reviewStop(task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<boolean> {
60
- return this.reviewDecision('stop', task, currentState, testerConversation);
61
+ async reviewStop(task: Test, currentState: ActionResult, testerConversation: Conversation, navigator?: Navigator): Promise<boolean> {
62
+ return this.reviewDecision('stop', task, currentState, testerConversation, navigator);
61
63
  }
62
64
 
63
- async reviewFinish(task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<boolean> {
64
- return this.reviewDecision('finish', task, currentState, testerConversation);
65
+ async reviewFinish(task: Test, currentState: ActionResult, testerConversation: Conversation, navigator?: Navigator): Promise<boolean> {
66
+ return this.reviewDecision('finish', task, currentState, testerConversation, navigator);
65
67
  }
66
68
 
67
- async reviewCompletion(task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<boolean> {
69
+ async reviewCompletion(task: Test, currentState: ActionResult, testerConversation: Conversation, navigator?: Navigator): Promise<boolean> {
68
70
  const verdictType = task.hasAchievedAny() ? 'finish' : 'stop';
69
- return this.reviewDecision(verdictType, task, currentState, testerConversation);
71
+ return this.reviewDecision(verdictType, task, currentState, testerConversation, navigator);
70
72
  }
71
73
 
72
- async finalReview(task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<boolean> {
74
+ async finalReview(task: Test, currentState: ActionResult, testerConversation: Conversation, navigator?: Navigator): Promise<boolean> {
73
75
  if (task.hasFinished) return false;
74
- return this.reviewCompletion(task, currentState, testerConversation);
76
+ return this.reviewCompletion(task, currentState, testerConversation, navigator);
75
77
  }
76
78
 
77
- private async reviewDecision(type: 'finish' | 'stop', task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<boolean> {
79
+ async reviewReset(task: Test, currentState: ActionResult, reason: string, testerConversation: Conversation): Promise<boolean> {
80
+ return this.reviewResetDecision(task, currentState, reason, testerConversation);
81
+ }
82
+
83
+ private async reviewDecision(type: 'finish' | 'stop', task: Test, currentState: ActionResult, testerConversation: Conversation, navigator?: Navigator): Promise<boolean> {
84
+ if (task.hasFinished) return false;
78
85
  tag('substep').log(`Pilot reviewing ${type} verdict...`);
79
86
 
80
87
  const sessionLog = this.formatSessionLog(testerConversation);
@@ -98,6 +105,12 @@ export class Pilot implements Agent {
98
105
  decision: z.enum(['pass', 'fail', 'continue', 'skipped']).describe('pass = test succeeded, fail = test failed, continue = tester should keep going, skipped = scenario is irrelevant OR systematic execution failures prevented testing'),
99
106
  reason: z.string().describe('What happened and why (1-2 sentences). Do NOT repeat the decision status (e.g. "scenario goal achieved/not achieved") — just explain the evidence. For continue: explain why rejected and suggest alternatives.'),
100
107
  guidance: z.string().nullable().describe('Required for "continue": specific actionable instruction for the tester — what exactly to verify, retry differently, or complete next. Be concrete.'),
108
+ requestVerification: z
109
+ .string()
110
+ .nullable()
111
+ .describe(
112
+ 'REQUIRED whenever decision is "pass" — provide a specific assertion that proves the scenario goal on the current page (e.g., "New test suite \\"Foo\\" is visible in the suites list"). The system runs it and bakes the resulting assertion into the generated test file; without it the test file has no verifiable expect(). Also use when evidence is insufficient before deciding pass/fail. Leave null for "continue", "fail", or "skipped".'
113
+ ),
101
114
  });
102
115
 
103
116
  const userContent = dedent`
@@ -126,6 +139,12 @@ export class Pilot implements Agent {
126
139
  - "continue" if tester hasn't completed the scenario goal yet — even if milestones were checked
127
140
  - If evidence is mixed, but final state indicates goal completion, choose "pass"
128
141
  - If evidence is mixed and final state is unclear, prefer "continue" over "fail"
142
+
143
+ When deciding "pass", you MUST also set requestVerification to a CodeceptJS assertion that
144
+ proves the scenario goal on the current page. Choose the strongest single evidence (a unique
145
+ element/text that exists ONLY because the scenario succeeded). The assertion is executed and
146
+ then converted into the spec file's expect() — without it the generated test has nothing to
147
+ assert and is worthless.
129
148
  `;
130
149
 
131
150
  const messages = [
@@ -148,6 +167,29 @@ export class Pilot implements Agent {
148
167
  return false;
149
168
  }
150
169
 
170
+ if (result.requestVerification && navigator) {
171
+ tag('substep').log(`Pilot requesting verification: ${result.requestVerification}`);
172
+ try {
173
+ const verifyResult = await navigator.verifyState(result.requestVerification, currentState);
174
+ if (verifyResult.verified) {
175
+ if (verifyResult.assertionSteps?.length) {
176
+ this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
177
+ }
178
+ tag('substep').log(`Pilot verified: ${result.requestVerification}`);
179
+ } else {
180
+ tag('substep').log(`Pilot verification failed: ${result.requestVerification}`);
181
+ if (result.decision === 'pass') {
182
+ const flipMessage = `Verification "${result.requestVerification}" did not match the page. Adjust approach and re-verify before finishing.`;
183
+ result.decision = 'continue';
184
+ result.reason = flipMessage;
185
+ result.guidance = result.guidance ?? flipMessage;
186
+ }
187
+ }
188
+ } catch (verifyErr: any) {
189
+ tag('warning').log(`Pilot verification errored: ${verifyErr.message}`);
190
+ }
191
+ }
192
+
151
193
  tag('info').log(`Pilot: ${result.decision} — ${result.reason}`);
152
194
  task.summary = result.reason;
153
195
 
@@ -180,6 +222,142 @@ export class Pilot implements Agent {
180
222
  }
181
223
  }
182
224
 
225
+ private async reviewResetDecision(task: Test, currentState: ActionResult, reason: string, testerConversation: Conversation): Promise<boolean> {
226
+ if (task.hasFinished) return false;
227
+ tag('substep').log(`Pilot reviewing reset (count=${task.resetCount})...`);
228
+
229
+ const sessionLog = this.formatSessionLog(testerConversation);
230
+ const stateContext = this.buildStateContext(currentState);
231
+ const notes = task.notesToString() || 'No notes recorded.';
232
+
233
+ const schema = z.object({
234
+ decision: z.enum(['allow', 'fail', 'continue', 'skipped']).describe('allow = reset proceeds, fail = test failed (stop looping), continue = veto reset, tester should act on current page instead, skipped = scenario is irrelevant or cannot be executed'),
235
+ reason: z.string().describe('What evidence justifies this decision (1-2 sentences). Do not restate the decision.'),
236
+ guidance: z.string().nullable().describe('Required for "continue": concrete instruction for what the tester should do instead of resetting (e.g. which tool to call, what to verify).'),
237
+ });
238
+
239
+ const userContent = dedent`
240
+ Tester requested reset. Previous reset count: ${task.resetCount - 1}.
241
+
242
+ Reason given by tester: ${reason || '(none)'}
243
+
244
+ <state>
245
+ ${stateContext}
246
+ </state>
247
+
248
+ ${this.formatExpectations(task)}
249
+
250
+ <notes>
251
+ ${notes}
252
+ </notes>
253
+
254
+ <session_log>
255
+ ${sessionLog || 'No actions recorded'}
256
+ </session_log>
257
+
258
+ Decide:
259
+ - "allow" — the reset is legitimate (navigation dead-end, wrong page, irrecoverable error on current page).
260
+ - "continue" — veto the reset; something on the current page can still be used to progress or verify. Provide guidance.
261
+ - "fail" — reset-looping: tester has already reset and the underlying obstacle will not change. Stop the test as failed.
262
+ - "skipped" — the scenario is inapplicable to this application or cannot be executed here.
263
+ `;
264
+
265
+ const messages = [
266
+ {
267
+ role: 'system' as const,
268
+ content: this.buildResetSystemPrompt(task),
269
+ },
270
+ { role: 'user' as const, content: userContent },
271
+ ];
272
+
273
+ try {
274
+ const response = await this.provider.generateObject(messages, schema, this.provider.getAgenticModel('pilot'), {
275
+ agentName: 'pilot',
276
+ experimental_telemetry: { functionId: 'pilot.reviewReset' },
277
+ });
278
+
279
+ const result = response?.object;
280
+ if (!result) {
281
+ return true;
282
+ }
283
+
284
+ tag('info').log(`Pilot reset verdict: ${result.decision} — ${result.reason}`);
285
+
286
+ if (result.decision === 'allow') {
287
+ tag('substep').log(`Pilot allowed reset: ${result.reason}`);
288
+ return true;
289
+ }
290
+
291
+ if (result.decision === 'fail') {
292
+ task.addNote(`Pilot: reset refused — ${result.reason}`, TestResult.FAILED);
293
+ task.finish(TestResult.FAILED);
294
+ return false;
295
+ }
296
+
297
+ if (result.decision === 'skipped') {
298
+ task.addNote(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED);
299
+ task.finish(TestResult.SKIPPED);
300
+ return false;
301
+ }
302
+
303
+ tag('substep').log(`Pilot vetoed reset: ${result.reason}`);
304
+ const guidanceText = result.guidance ? `\n\nWhat to do instead: ${result.guidance}` : '';
305
+ testerConversation.addUserText(`Pilot vetoed reset: ${result.reason}${guidanceText}`);
306
+ return false;
307
+ } catch (error: any) {
308
+ tag('warning').log(`Pilot reset review failed: ${error.message}`);
309
+ return true;
310
+ }
311
+ }
312
+
313
+ private buildResetSystemPrompt(task: Test): string {
314
+ return dedent`
315
+ You are Pilot — the supervisor that decides whether a reset is legitimate.
316
+ Tester wants to reset (navigate back to the start URL and discard progress).
317
+
318
+ SCENARIO: ${task.scenario}
319
+
320
+ Reset is DESTRUCTIVE. It abandons all work done in this iteration. In stateful apps, any
321
+ side effects (records created, forms submitted) persist on the server — resetting does not
322
+ undo them. Unnecessary resets create duplicate data and loop forever.
323
+
324
+ LEGITIMATE RESET (decide "allow"):
325
+ - The current page is unrelated to the scenario and no path leads back.
326
+ - Navigation is stuck in an error state with no recoverable action.
327
+ - The tester arrived on a page that cannot host the scenario at all.
328
+
329
+ ILLEGITIMATE RESET (decide "continue"):
330
+ - The previous action already succeeded (URL changed to a success/detail page, record visible,
331
+ confirmation shown) and tester wants to redo it because an assertion did not match.
332
+ The work is done — verify, record, or finish instead of restarting.
333
+ - A single expectation / milestone does not match app reality but the scenario goal may still
334
+ have been achieved. Do not redo — instruct the tester to verify the actual outcome.
335
+ - Tester wants to "try again with different input" after a form was submitted. Submitting
336
+ again creates a duplicate; guide toward editing the existing record or accepting the state.
337
+
338
+ RESET-LOOP (decide "fail"):
339
+ - resetCount >= 2 and the previous resets did not change the underlying situation.
340
+ - The same flow has been attempted twice with the same failure mode.
341
+ - Repeating the reset cannot produce new information.
342
+
343
+ SCENARIO INAPPLICABLE (decide "skipped"):
344
+ - The feature the scenario targets does not exist on this app, or prerequisites cannot be met.
345
+
346
+ PRIORITY:
347
+ 1) Evidence of successful side effects in session_log (URL transition, new record visible).
348
+ If present, almost never allow the reset — the work is done.
349
+ 2) resetCount. Each prior reset raises the bar for allowing another.
350
+ 3) Tester's stated reason. Weigh it against the observed evidence, do not trust it blindly.
351
+
352
+ GUIDANCE FIELD (required when decision is "continue"):
353
+ Give a specific next action on the current page: which tool to call, what to verify, or how to
354
+ record the outcome. Do not suggest repeating actions that already succeeded.
355
+
356
+ EXPECTED RESULTS (milestones, not the goal):
357
+ ${task.expected.map((e) => `- ${e}`).join('\n')}
358
+ `;
359
+ }
360
+
183
361
  private buildVerdictSystemPrompt(type: string, task: Test): string {
184
362
  return dedent`
185
363
  You are Pilot — the final decision maker for test pass/fail.
@@ -281,10 +459,14 @@ export class Pilot implements Agent {
281
459
  the elements needed for the scenario. The page summary does not list every element.
282
460
  Prefer interacting with the current page over navigating away.
283
461
 
462
+ If you load a recipe via learn_experience, do NOT rewrite its code in your plan — the
463
+ raw recipe is forwarded to Tester automatically. Reference it by step ("apply recipe
464
+ steps 1–3, then…") and call out anywhere your scenario diverges from it.
465
+
284
466
  Be concise and specific. Tester will follow your plan.
285
467
  `,
286
468
  'pilot.planTest',
287
- { tools: true, maxToolRoundtrips: 3, task }
469
+ { tools: true, planningOnly: true, maxToolRoundtrips: 3, task }
288
470
  );
289
471
  }
290
472
 
@@ -377,7 +559,7 @@ export class Pilot implements Agent {
377
559
  return `CHECKED: ${checked.length > 0 ? checked.join(', ') : 'none'}\nREMAINING: ${remaining.length > 0 ? remaining.join(', ') : 'none'}`;
378
560
  }
379
561
 
380
- private async sendToPilot(userText: string, functionId: string, opts: { tools?: boolean; maxToolRoundtrips?: number; task?: Test } = {}): Promise<string> {
562
+ private async sendToPilot(userText: string, functionId: string, opts: { tools?: boolean; planningOnly?: boolean; maxToolRoundtrips?: number; task?: Test } = {}): Promise<string> {
381
563
  debugLog(`sendToPilot: ${functionId}, tools: ${!!opts.tools}, roundtrips: ${opts.maxToolRoundtrips ?? 0}`);
382
564
 
383
565
  let finalUserText = userText;
@@ -388,7 +570,10 @@ export class Pilot implements Agent {
388
570
  }
389
571
  }
390
572
  this.conversation!.addUserText(finalUserText);
391
- let tools = opts.tools ? this.agentTools : undefined;
573
+ let tools: any;
574
+ if (opts.tools) {
575
+ tools = opts.planningOnly ? this.pickPlanningTools() : this.agentTools;
576
+ }
392
577
 
393
578
  if (opts.tools && opts.task) {
394
579
  tools = { ...tools, ...this.buildPreconditionTool(opts.task) };
@@ -399,7 +584,19 @@ export class Pilot implements Agent {
399
584
  agentName: 'pilot',
400
585
  experimental_telemetry: { functionId },
401
586
  });
402
- return result?.response?.text || '';
587
+ const text = result?.response?.text || '';
588
+ const learned = (result?.toolExecutions || []).filter((e: any) => e.toolName === 'learn_experience' && e.output?.content).map((e: any) => e.output.content);
589
+ if (learned.length === 0) return text;
590
+ return dedent`
591
+ ${text}
592
+
593
+ <applied_experience>
594
+ Recipes from prior successful runs that Pilot judged relevant. Locators worked then; the page may have changed since.
595
+ Treat code blocks below as a starting hypothesis. If a locator misses, fall back to ARIA/UI-map.
596
+
597
+ ${learned.join('\n\n')}
598
+ </applied_experience>
599
+ `;
403
600
  }
404
601
 
405
602
  private getExperienceToc(): string {
@@ -411,6 +608,19 @@ export class Pilot implements Agent {
411
608
  return renderExperienceToc(toc);
412
609
  }
413
610
 
611
+ private pickPlanningTools() {
612
+ const { see, context, verify, research, getVisitedStates, xpathCheck, learn_experience } = this.agentTools ?? {};
613
+ const planning: Record<string, unknown> = {};
614
+ if (see) planning.see = see;
615
+ if (context) planning.context = context;
616
+ if (verify) planning.verify = verify;
617
+ if (research) planning.research = research;
618
+ if (getVisitedStates) planning.getVisitedStates = getVisitedStates;
619
+ if (xpathCheck) planning.xpathCheck = xpathCheck;
620
+ if (learn_experience) planning.learn_experience = learn_experience;
621
+ return planning;
622
+ }
623
+
414
624
  private buildPreconditionTool(task: Test) {
415
625
  return {
416
626
  precondition: tool({
@@ -558,7 +768,13 @@ export class Pilot implements Agent {
558
768
  }
559
769
 
560
770
  if (text.includes('ATTACH_UI_MAP')) {
561
- const uiMap = await this.researcher.research(currentState);
771
+ let uiMap = '';
772
+ try {
773
+ uiMap = await this.researcher.research(currentState);
774
+ } catch (err) {
775
+ if (!(err instanceof ErrorPageError)) throw err;
776
+ tag('warning').log(`Pilot UI map skipped: ${err.message}`);
777
+ }
562
778
  if (uiMap) {
563
779
  parts.push(dedent`
564
780
  <page_ui_map>
@@ -727,6 +943,13 @@ export class Pilot implements Agent {
727
943
  - If the goal was achieved by a previous action (SUCCESS in recent_actions with confirming ariaDiff): instruct Tester to verify() the result and finish(). Do NOT repeat the same action.
728
944
  - If Tester keeps re-opening the same panel and re-submitting the same data — STOP. The action was already completed.
729
945
 
946
+ Action-goal alignment — classify every recent successful action:
947
+ - GOAL-ADVANCING: creates, edits, removes, submits, or verifies the scenario's subject data (the object the scenario actually changes).
948
+ - VIEW-ONLY: toggles layout, filters, tabs, segment controls, sort orders, collapse/expand — changes which data is shown without modifying it.
949
+ - A single VIEW-ONLY action is legitimate when needed to reveal a target element for the next GOAL-ADVANCING action.
950
+ - A run of two or more consecutive successful VIEW-ONLY actions with no interleaved GOAL-ADVANCING action is thrashing — Tester is exploring UI instead of executing the scenario. Redirect Tester to the specific mutation or verification the scenario requires.
951
+ - VIEW-ONLY actions also tend to produce large page diffs with many htmlParts; if you see that pattern repeatedly in recent_actions, treat it as evidence of thrashing.
952
+
730
953
  Navigation awareness — always compare current page url to START URL:
731
954
  - subpage navigation (deeper path from START URL) — OK, scenario may need sub-pages
732
955
  - outer-page navigation (parent/sibling path from START URL) — SUSPICIOUS. The scenario target is on the START page. Do NOT rationalize leaving it. Instruct Tester to back() or reset().
package/src/ai/planner.ts CHANGED
@@ -447,6 +447,34 @@ export class Planner extends PlannerBase implements Agent {
447
447
  const titleListing = allTests.map((t) => `- "${t.scenario}" [${t.result || 'pending'}]`).join('\n');
448
448
  const compactContext = planToCompactAiContext(this.currentPlan);
449
449
 
450
+ let planningStrategy: string;
451
+ if (feature) {
452
+ planningStrategy = dedent`
453
+ <planning_strategy>
454
+ Stay strictly inside the "${feature}" feature area. Do NOT switch to a different, unrelated feature even if it has no coverage.
455
+ Propose ${this.MIN_TASKS}-${this.MAX_TASKS} additional scenarios for "${feature}" that are not already in the tested list.
456
+ Use the <approach> above to decide which new angles to explore — different controls, inputs, states, outcome categories, or combinations — all within "${feature}".
457
+ Return an empty scenarios array only when no genuinely new scenario for "${feature}" remains.
458
+ </planning_strategy>
459
+ `;
460
+ } else {
461
+ let extendedResearchHint = '';
462
+ if (mdq(plannerResearch).query('section("Extended Research")').count() > 0) {
463
+ extendedResearchHint = 'IMPORTANT: The research contains "Extended Research" sections with dropdowns, modals, and panels. Prioritize testing features from Extended Research that have no coverage yet.';
464
+ }
465
+ planningStrategy = dedent`
466
+ <planning_strategy>
467
+ Find a feature area in the research that has NO or minimal test coverage.
468
+ Pick that ONE feature and propose ${this.MIN_TASKS}-${this.MAX_TASKS} tests for it.
469
+ ${extendedResearchHint}
470
+
471
+ Follow the <approach> described above when proposing tests for this feature.
472
+
473
+ If ALL features across ALL research sections are covered, return empty scenarios array.
474
+ </planning_strategy>
475
+ `;
476
+ }
477
+
450
478
  conversation.addUserText(dedent`
451
479
  CRITICAL: This plan already has tests.
452
480
 
@@ -466,15 +494,7 @@ export class Planner extends PlannerBase implements Agent {
466
494
  ${compactContext}
467
495
  </tested_scenarios>
468
496
 
469
- <planning_strategy>
470
- Find a feature area in the research that has NO or minimal test coverage.
471
- Pick that ONE feature and propose ${this.MIN_TASKS}-${this.MAX_TASKS} tests for it.
472
- ${mdq(plannerResearch).query('section("Extended Research")').count() > 0 ? 'IMPORTANT: The research contains "Extended Research" sections with dropdowns, modals, and panels. Prioritize testing features from Extended Research that have no coverage yet.' : ''}
473
-
474
- Follow the <approach> described above when proposing tests for this feature.
475
-
476
- If ALL features across ALL research sections are covered, return empty scenarios array.
477
- </planning_strategy>
497
+ ${planningStrategy}
478
498
 
479
499
  <context_from_previous_tests>
480
500
  During testing, the following pages were visited: