explorbot 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +37 -1
  2. package/bin/explorbot-cli.ts +27 -18
  3. package/dist/bin/explorbot-cli.js +26 -18
  4. package/dist/package.json +3 -3
  5. package/dist/rules/navigator/output.md +9 -0
  6. package/dist/rules/navigator/verification-actions.md +2 -0
  7. package/dist/src/action-result.js +23 -1
  8. package/dist/src/action.js +51 -42
  9. package/dist/src/ai/bosun.js +11 -1
  10. package/dist/src/ai/conversation.js +39 -0
  11. package/dist/src/ai/historian/codeceptjs.js +109 -0
  12. package/dist/src/ai/historian/experience.js +321 -0
  13. package/dist/src/ai/historian/mixin.js +2 -0
  14. package/dist/src/ai/historian/playwright.js +145 -0
  15. package/dist/src/ai/historian/screencast.js +121 -0
  16. package/dist/src/ai/historian/utils.js +18 -0
  17. package/dist/src/ai/historian.js +21 -405
  18. package/dist/src/ai/navigator.js +82 -29
  19. package/dist/src/ai/pilot.js +232 -13
  20. package/dist/src/ai/planner.js +29 -9
  21. package/dist/src/ai/provider.js +54 -17
  22. package/dist/src/ai/researcher.js +41 -32
  23. package/dist/src/ai/rules.js +26 -14
  24. package/dist/src/ai/tester.js +90 -26
  25. package/dist/src/ai/tools.js +13 -7
  26. package/dist/src/browser-server.js +16 -3
  27. package/dist/src/commands/add-rule-command.js +11 -8
  28. package/dist/src/commands/clean-command.js +2 -1
  29. package/dist/src/commands/explore-command.js +43 -15
  30. package/dist/src/commands/init-command.js +9 -8
  31. package/dist/src/commands/plan-command.js +32 -0
  32. package/dist/src/commands/plan-save-command.js +19 -7
  33. package/dist/src/commands/rerun-command.js +4 -0
  34. package/dist/src/components/App.js +15 -5
  35. package/dist/src/execution-controller.js +13 -2
  36. package/dist/src/experience-tracker.js +20 -64
  37. package/dist/src/explorbot.js +8 -8
  38. package/dist/src/explorer.js +11 -3
  39. package/dist/src/observability.js +50 -99
  40. package/dist/src/playwright-recorder.js +309 -0
  41. package/dist/src/reporter.js +4 -1
  42. package/dist/src/test-plan.js +12 -0
  43. package/dist/src/utils/aria.js +37 -1
  44. package/dist/src/utils/error-page.js +20 -7
  45. package/dist/src/utils/next-steps.js +37 -0
  46. package/dist/src/utils/strings.js +15 -0
  47. package/package.json +3 -3
  48. package/rules/navigator/output.md +9 -0
  49. package/rules/navigator/verification-actions.md +2 -0
  50. package/src/action-result.ts +26 -1
  51. package/src/action.ts +49 -41
  52. package/src/ai/bosun.ts +11 -1
  53. package/src/ai/conversation.ts +37 -0
  54. package/src/ai/historian/codeceptjs.ts +130 -0
  55. package/src/ai/historian/experience.ts +384 -0
  56. package/src/ai/historian/mixin.ts +4 -0
  57. package/src/ai/historian/playwright.ts +169 -0
  58. package/src/ai/historian/screencast.ts +133 -0
  59. package/src/ai/historian/utils.ts +23 -0
  60. package/src/ai/historian.ts +37 -473
  61. package/src/ai/navigator.ts +82 -29
  62. package/src/ai/pilot.ts +237 -14
  63. package/src/ai/planner.ts +29 -9
  64. package/src/ai/provider.ts +51 -17
  65. package/src/ai/researcher.ts +45 -33
  66. package/src/ai/rules.ts +27 -14
  67. package/src/ai/tester.ts +94 -26
  68. package/src/ai/tools.ts +47 -25
  69. package/src/browser-server.ts +17 -3
  70. package/src/commands/add-rule-command.ts +11 -7
  71. package/src/commands/clean-command.ts +2 -1
  72. package/src/commands/explore-command.ts +46 -14
  73. package/src/commands/init-command.ts +9 -8
  74. package/src/commands/plan-command.ts +35 -0
  75. package/src/commands/plan-save-command.ts +18 -7
  76. package/src/commands/rerun-command.ts +5 -0
  77. package/src/components/App.tsx +16 -5
  78. package/src/config.ts +12 -1
  79. package/src/execution-controller.ts +14 -3
  80. package/src/experience-tracker.ts +21 -72
  81. package/src/explorbot.ts +8 -8
  82. package/src/explorer.ts +13 -3
  83. package/src/observability.ts +50 -109
  84. package/src/playwright-recorder.ts +305 -0
  85. package/src/reporter.ts +4 -1
  86. package/src/test-plan.ts +12 -0
  87. package/src/utils/aria.ts +38 -1
  88. package/src/utils/error-page.ts +22 -7
  89. package/src/utils/next-steps.ts +51 -0
  90. package/src/utils/strings.ts +17 -0
@@ -8,8 +8,9 @@ import { HooksRunner } from "../utils/hooks-runner.js";
8
8
  import { createDebug, pluralize, tag } from '../utils/logger.js';
9
9
  import { loop, pause } from '../utils/loop.js';
10
10
  import { RulesLoader } from "../utils/rules-loader.js";
11
+ import { extractStatePath } from '../utils/url-matcher.js';
11
12
  import { Researcher } from "./researcher.js";
12
- import { actionRule, locatorRule } from './rules.js';
13
+ import { actionRule, locatorRule, unexpectedPopupRule } from './rules.js';
13
14
  import { isInteractive } from './task-agent.js';
14
15
  import { createAgentTools } from "./tools.js";
15
16
  const debugLog = createDebug('explorbot:navigator');
@@ -131,11 +132,10 @@ class Navigator {
131
132
  </hint>`;
132
133
  }
133
134
  if (!actionResult.isInsideIframe) {
134
- const toc = this.experienceTracker.getExperienceTableOfContents(actionResult);
135
- if (toc.length > 0) {
136
- const totalSections = toc.reduce((sum, entry) => sum + entry.sections.length, 0);
137
- tag('substep').log(`Found ${toc.length} experience ${pluralize(toc.length, 'file')} (${totalSections} sections) for: ${actionResult.url}`);
138
- experience = renderExperienceToc(toc);
135
+ const successful = this.experienceTracker.getSuccessfulExperience(actionResult);
136
+ if (successful.length > 0) {
137
+ tag('substep').log(`Found ${successful.length} experience ${pluralize(successful.length, 'file')} for: ${actionResult.url}`);
138
+ experience = `<experience>\nPast successful recipes recorded from prior runs for this page. Prefer these solutions first if they match the goal.\n\n${successful.join('\n\n')}\n</experience>`;
139
139
  }
140
140
  }
141
141
  const prompt = dedent `
@@ -163,6 +163,8 @@ class Navigator {
163
163
 
164
164
  ${actionRule}
165
165
 
166
+ ${unexpectedPopupRule}
167
+
166
168
  ${RulesLoader.loadRules('navigator', ['multiple-locator', 'output'], actionResult.url || '').replace('{{maxAttempts}}', String(this.MAX_ATTEMPTS))}
167
169
 
168
170
  ${experience}
@@ -171,11 +173,13 @@ class Navigator {
171
173
  `;
172
174
  const conversation = this.provider.startConversation(this.systemPrompt, 'navigator');
173
175
  conversation.addUserText(prompt);
174
- const tools = this.buildExperienceTools();
176
+ const tools = undefined;
175
177
  let codeBlocks = [];
176
178
  let htmlContextAdded = false;
177
179
  let codeBlockIndex = 0;
178
180
  let totalAttempts = 0;
181
+ const progressBlocks = [];
182
+ const batchFailures = [];
179
183
  let resolved = false;
180
184
  await loop(async ({ stop }) => {
181
185
  if (codeBlocks.length === 0) {
@@ -194,43 +198,90 @@ class Navigator {
194
198
  }
195
199
  const codeBlock = codeBlocks[codeBlockIndex];
196
200
  if (!codeBlock) {
201
+ if (batchFailures.length === 0 && htmlContextAdded) {
202
+ stop();
203
+ return;
204
+ }
205
+ tag('substep').log('Feeding failures back to AI for a new batch...');
206
+ let contextMsg = 'Previous solutions did not work. Analyze the failures and try DIFFERENT strategies (not syntactic variants of the same locator).\n\n';
207
+ if (batchFailures.length > 0) {
208
+ const lines = batchFailures.map((f) => `- \`${f.code.split('\n')[0]}\` → ${f.error}`).join('\n');
209
+ contextMsg += `<previous_failures>\n${lines}\n</previous_failures>\n\n`;
210
+ }
197
211
  if (!htmlContextAdded) {
198
212
  htmlContextAdded = true;
199
- tag('substep').log('Adding HTML context for better resolution...');
200
- conversation.addUserText(dedent `
201
- Previous solutions did not work. Here is the full HTML context:
202
-
203
- <page_html>
204
- ${await actionResult.combinedHtml()}
205
- </page_html>
206
-
207
- Please suggest new solutions based on this additional context.
208
- `);
209
- codeBlocks = [];
210
- return;
213
+ contextMsg += `Full HTML context:\n\n<page_html>\n${await actionResult.combinedHtml()}\n</page_html>\n\n`;
211
214
  }
212
- stop();
215
+ contextMsg += 'Propose new solutions. If errors mention "intercepts pointer events" or timeouts on visible elements, an overlay is blocking — dismiss it first (Escape, click outside, Close button) before retrying the original action.';
216
+ conversation.addUserText(contextMsg);
217
+ codeBlocks = [];
218
+ batchFailures.length = 0;
213
219
  return;
214
220
  }
215
221
  codeBlockIndex++;
216
222
  totalAttempts++;
217
223
  await this.explorer.switchToMainFrame();
224
+ const prevHash = action.actionResult?.getStateHash() ?? actionResult.getStateHash();
218
225
  debugLog(`Attempting resolution: ${codeBlock}`);
219
- resolved = await action.attempt(codeBlock, message);
226
+ const attemptOk = await action.attempt(codeBlock, message);
227
+ const page = action.playwrightHelper?.page;
228
+ if (page) {
229
+ try {
230
+ await page.waitForLoadState('load', { timeout: 5000 });
231
+ }
232
+ catch {
233
+ // Navigation did not reach 'load' state within timeout; continue and verify URL
234
+ }
235
+ }
236
+ if (!attemptOk) {
237
+ const raw = action.lastError?.message || 'attempt failed';
238
+ const firstMeaningful = raw.split('\n').find((l) => l.trim() && !l.trim().startsWith('at ')) || raw;
239
+ const shortErr = firstMeaningful.replace(/\s+/g, ' ').trim().slice(0, 220);
240
+ batchFailures.push({ code: codeBlock, error: shortErr });
241
+ }
220
242
  if (expectedUrl) {
221
- await action.getActor().wait(2);
222
- const freshState = await action.capturePageState();
223
- if (normalizeUrl(freshState.url || '') === normalizeUrl(expectedUrl)) {
224
- resolved = true;
243
+ if (page) {
244
+ try {
245
+ await page.waitForURL((url) => normalizeUrl(url.pathname) === normalizeUrl(expectedUrl), { timeout: 5000 });
246
+ }
247
+ catch {
248
+ // URL did not transition to expectedUrl within timeout
249
+ }
225
250
  }
226
- else if (resolved) {
251
+ const freshState = await action.capturePageState();
252
+ const urlMatches = normalizeUrl(freshState.url || '') === normalizeUrl(expectedUrl);
253
+ const stateChanged = freshState.getStateHash() !== actionResult.getStateHash();
254
+ resolved = urlMatches && stateChanged;
255
+ if (!resolved && attemptOk) {
227
256
  tag('warning').log(`URL verification failed: expected ${expectedUrl}, got ${freshState.url}`);
228
- resolved = false;
257
+ }
258
+ if (freshState.getStateHash() !== prevHash && (attemptOk || urlMatches)) {
259
+ progressBlocks.push(codeBlock);
229
260
  }
230
261
  }
262
+ else {
263
+ resolved = attemptOk;
264
+ if (attemptOk)
265
+ progressBlocks.push(codeBlock);
266
+ }
231
267
  if (resolved) {
232
268
  tag('success').log('Navigation resolved successfully');
233
- this.experienceTracker.writeAction(actionResult, { title: message, code: codeBlock });
269
+ let scenario = message.split('\n')[0];
270
+ if (expectedUrl) {
271
+ const fromPath = extractStatePath(actionResult.url || '');
272
+ const toPath = extractStatePath(expectedUrl);
273
+ scenario = `reach ${toPath} from ${fromPath}`;
274
+ }
275
+ const recipe = progressBlocks
276
+ .join('\n')
277
+ .split('\n')
278
+ .filter((line) => !/^\s*I\.amOnPage\s*\(/.test(line))
279
+ .join('\n')
280
+ .trim();
281
+ if (recipe) {
282
+ const body = `## FLOW: ${scenario}\n\n* ${scenario}\n\n\`\`\`js\n${recipe}\n\`\`\`\n\n---\n`;
283
+ this.experienceTracker.writeFlow(actionResult, body);
284
+ }
234
285
  stop();
235
286
  return;
236
287
  }
@@ -455,6 +506,7 @@ class Navigator {
455
506
  const tools = this.buildExperienceTools();
456
507
  let codeBlocks = [];
457
508
  const successfulCodes = [];
509
+ const assertionSteps = [];
458
510
  const action = this.explorer.createAction();
459
511
  await loop(async ({ stop, iteration }) => {
460
512
  if (codeBlocks.length === 0) {
@@ -479,6 +531,7 @@ class Navigator {
479
531
  if (verified) {
480
532
  tag('success').log('Verification passed');
481
533
  successfulCodes.push(codeBlock);
534
+ assertionSteps.push(...action.assertionSteps);
482
535
  }
483
536
  }, {
484
537
  maxAttempts: this.MAX_ATTEMPTS,
@@ -493,7 +546,7 @@ class Navigator {
493
546
  const verified = totalAttempted <= 1 ? successfulCodes.length > 0 : successfulCodes.length > totalAttempted / 2;
494
547
  actionResult.addVerification(message, verified);
495
548
  this.explorer.getStateManager().updateState(actionResult);
496
- return { verified, successfulCodes, totalAttempted };
549
+ return { verified, successfulCodes, assertionSteps, totalAttempted };
497
550
  }
498
551
  }
499
552
  export { Navigator };
@@ -6,6 +6,7 @@ import { ConfigParser } from "../config.js";
6
6
  import { renderExperienceToc } from "../experience-tracker.js";
7
7
  import { TestResult } from "../test-plan.js";
8
8
  import { collectInteractiveNodes, detectFocusArea, extractFocusedElement } from "../utils/aria.js";
9
+ import { ErrorPageError } from "../utils/error-page.js";
9
10
  import { createDebug, tag } from "../utils/logger.js";
10
11
  const debugLog = createDebug('explorbot:pilot');
11
12
  import { truncateJson } from "../utils/strings.js";
@@ -42,22 +43,27 @@ export class Pilot {
42
43
  return null;
43
44
  return this.conversation.getLastMessage() || null;
44
45
  }
45
- async reviewStop(task, currentState, testerConversation) {
46
- return this.reviewDecision('stop', task, currentState, testerConversation);
46
+ async reviewStop(task, currentState, testerConversation, navigator) {
47
+ return this.reviewDecision('stop', task, currentState, testerConversation, navigator);
47
48
  }
48
- async reviewFinish(task, currentState, testerConversation) {
49
- return this.reviewDecision('finish', task, currentState, testerConversation);
49
+ async reviewFinish(task, currentState, testerConversation, navigator) {
50
+ return this.reviewDecision('finish', task, currentState, testerConversation, navigator);
50
51
  }
51
- async reviewCompletion(task, currentState, testerConversation) {
52
+ async reviewCompletion(task, currentState, testerConversation, navigator) {
52
53
  const verdictType = task.hasAchievedAny() ? 'finish' : 'stop';
53
- return this.reviewDecision(verdictType, task, currentState, testerConversation);
54
+ return this.reviewDecision(verdictType, task, currentState, testerConversation, navigator);
54
55
  }
55
- async finalReview(task, currentState, testerConversation) {
56
+ async finalReview(task, currentState, testerConversation, navigator) {
56
57
  if (task.hasFinished)
57
58
  return false;
58
- return this.reviewCompletion(task, currentState, testerConversation);
59
+ return this.reviewCompletion(task, currentState, testerConversation, navigator);
59
60
  }
60
- async reviewDecision(type, task, currentState, testerConversation) {
61
+ async reviewReset(task, currentState, reason, testerConversation) {
62
+ return this.reviewResetDecision(task, currentState, reason, testerConversation);
63
+ }
64
+ async reviewDecision(type, task, currentState, testerConversation, navigator) {
65
+ if (task.hasFinished)
66
+ return false;
61
67
  tag('substep').log(`Pilot reviewing ${type} verdict...`);
62
68
  const sessionLog = this.formatSessionLog(testerConversation);
63
69
  const stateContext = this.buildStateContext(currentState);
@@ -79,6 +85,10 @@ export class Pilot {
79
85
  decision: z.enum(['pass', 'fail', 'continue', 'skipped']).describe('pass = test succeeded, fail = test failed, continue = tester should keep going, skipped = scenario is irrelevant OR systematic execution failures prevented testing'),
80
86
  reason: z.string().describe('What happened and why (1-2 sentences). Do NOT repeat the decision status (e.g. "scenario goal achieved/not achieved") — just explain the evidence. For continue: explain why rejected and suggest alternatives.'),
81
87
  guidance: z.string().nullable().describe('Required for "continue": specific actionable instruction for the tester — what exactly to verify, retry differently, or complete next. Be concrete.'),
88
+ requestVerification: z
89
+ .string()
90
+ .nullable()
91
+ .describe('REQUIRED whenever decision is "pass" — provide a specific assertion that proves the scenario goal on the current page (e.g., "New test suite \\"Foo\\" is visible in the suites list"). The system runs it and bakes the resulting assertion into the generated test file; without it the test file has no verifiable expect(). Also use when evidence is insufficient before deciding pass/fail. Leave null for "continue", "fail", or "skipped".'),
82
92
  });
83
93
  const userContent = dedent `
84
94
  Tester wants to ${type} the test.
@@ -106,6 +116,12 @@ export class Pilot {
106
116
  - "continue" if tester hasn't completed the scenario goal yet — even if milestones were checked
107
117
  - If evidence is mixed, but final state indicates goal completion, choose "pass"
108
118
  - If evidence is mixed and final state is unclear, prefer "continue" over "fail"
119
+
120
+ When deciding "pass", you MUST also set requestVerification to a CodeceptJS assertion that
121
+ proves the scenario goal on the current page. Choose the strongest single evidence (a unique
122
+ element/text that exists ONLY because the scenario succeeded). The assertion is executed and
123
+ then converted into the spec file's expect() — without it the generated test has nothing to
124
+ assert and is worthless.
109
125
  `;
110
126
  const messages = [
111
127
  {
@@ -124,6 +140,30 @@ export class Pilot {
124
140
  task.finish(TestResult.FAILED);
125
141
  return false;
126
142
  }
143
+ if (result.requestVerification && navigator) {
144
+ tag('substep').log(`Pilot requesting verification: ${result.requestVerification}`);
145
+ try {
146
+ const verifyResult = await navigator.verifyState(result.requestVerification, currentState);
147
+ if (verifyResult.verified) {
148
+ if (verifyResult.assertionSteps?.length) {
149
+ this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
150
+ }
151
+ tag('substep').log(`Pilot verified: ${result.requestVerification}`);
152
+ }
153
+ else {
154
+ tag('substep').log(`Pilot verification failed: ${result.requestVerification}`);
155
+ if (result.decision === 'pass') {
156
+ const flipMessage = `Verification "${result.requestVerification}" did not match the page. Adjust approach and re-verify before finishing.`;
157
+ result.decision = 'continue';
158
+ result.reason = flipMessage;
159
+ result.guidance = result.guidance ?? flipMessage;
160
+ }
161
+ }
162
+ }
163
+ catch (verifyErr) {
164
+ tag('warning').log(`Pilot verification errored: ${verifyErr.message}`);
165
+ }
166
+ }
127
167
  tag('info').log(`Pilot: ${result.decision} — ${result.reason}`);
128
168
  task.summary = result.reason;
129
169
  if (result.decision === 'pass') {
@@ -152,6 +192,131 @@ export class Pilot {
152
192
  return false;
153
193
  }
154
194
  }
195
+ async reviewResetDecision(task, currentState, reason, testerConversation) {
196
+ if (task.hasFinished)
197
+ return false;
198
+ tag('substep').log(`Pilot reviewing reset (count=${task.resetCount})...`);
199
+ const sessionLog = this.formatSessionLog(testerConversation);
200
+ const stateContext = this.buildStateContext(currentState);
201
+ const notes = task.notesToString() || 'No notes recorded.';
202
+ const schema = z.object({
203
+ decision: z.enum(['allow', 'fail', 'continue', 'skipped']).describe('allow = reset proceeds, fail = test failed (stop looping), continue = veto reset, tester should act on current page instead, skipped = scenario is irrelevant or cannot be executed'),
204
+ reason: z.string().describe('What evidence justifies this decision (1-2 sentences). Do not restate the decision.'),
205
+ guidance: z.string().nullable().describe('Required for "continue": concrete instruction for what the tester should do instead of resetting (e.g. which tool to call, what to verify).'),
206
+ });
207
+ const userContent = dedent `
208
+ Tester requested reset. Previous reset count: ${task.resetCount - 1}.
209
+
210
+ Reason given by tester: ${reason || '(none)'}
211
+
212
+ <state>
213
+ ${stateContext}
214
+ </state>
215
+
216
+ ${this.formatExpectations(task)}
217
+
218
+ <notes>
219
+ ${notes}
220
+ </notes>
221
+
222
+ <session_log>
223
+ ${sessionLog || 'No actions recorded'}
224
+ </session_log>
225
+
226
+ Decide:
227
+ - "allow" — the reset is legitimate (navigation dead-end, wrong page, irrecoverable error on current page).
228
+ - "continue" — veto the reset; something on the current page can still be used to progress or verify. Provide guidance.
229
+ - "fail" — reset-looping: tester has already reset and the underlying obstacle will not change. Stop the test as failed.
230
+ - "skipped" — the scenario is inapplicable to this application or cannot be executed here.
231
+ `;
232
+ const messages = [
233
+ {
234
+ role: 'system',
235
+ content: this.buildResetSystemPrompt(task),
236
+ },
237
+ { role: 'user', content: userContent },
238
+ ];
239
+ try {
240
+ const response = await this.provider.generateObject(messages, schema, this.provider.getAgenticModel('pilot'), {
241
+ agentName: 'pilot',
242
+ experimental_telemetry: { functionId: 'pilot.reviewReset' },
243
+ });
244
+ const result = response?.object;
245
+ if (!result) {
246
+ return true;
247
+ }
248
+ tag('info').log(`Pilot reset verdict: ${result.decision} — ${result.reason}`);
249
+ if (result.decision === 'allow') {
250
+ tag('substep').log(`Pilot allowed reset: ${result.reason}`);
251
+ return true;
252
+ }
253
+ if (result.decision === 'fail') {
254
+ task.addNote(`Pilot: reset refused — ${result.reason}`, TestResult.FAILED);
255
+ task.finish(TestResult.FAILED);
256
+ return false;
257
+ }
258
+ if (result.decision === 'skipped') {
259
+ task.addNote(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED);
260
+ task.finish(TestResult.SKIPPED);
261
+ return false;
262
+ }
263
+ tag('substep').log(`Pilot vetoed reset: ${result.reason}`);
264
+ const guidanceText = result.guidance ? `\n\nWhat to do instead: ${result.guidance}` : '';
265
+ testerConversation.addUserText(`Pilot vetoed reset: ${result.reason}${guidanceText}`);
266
+ return false;
267
+ }
268
+ catch (error) {
269
+ tag('warning').log(`Pilot reset review failed: ${error.message}`);
270
+ return true;
271
+ }
272
+ }
273
+ buildResetSystemPrompt(task) {
274
+ return dedent `
275
+ You are Pilot — the supervisor that decides whether a reset is legitimate.
276
+ Tester wants to reset (navigate back to the start URL and discard progress).
277
+
278
+ SCENARIO: ${task.scenario}
279
+
280
+ Reset is DESTRUCTIVE. It abandons all work done in this iteration. In stateful apps, any
281
+ side effects (records created, forms submitted) persist on the server — resetting does not
282
+ undo them. Unnecessary resets create duplicate data and loop forever.
283
+
284
+ LEGITIMATE RESET (decide "allow"):
285
+ - The current page is unrelated to the scenario and no path leads back.
286
+ - Navigation is stuck in an error state with no recoverable action.
287
+ - The tester arrived on a page that cannot host the scenario at all.
288
+
289
+ ILLEGITIMATE RESET (decide "continue"):
290
+ - The previous action already succeeded (URL changed to a success/detail page, record visible,
291
+ confirmation shown) and tester wants to redo it because an assertion did not match.
292
+ The work is done — verify, record, or finish instead of restarting.
293
+ - A single expectation / milestone does not match app reality but the scenario goal may still
294
+ have been achieved. Do not redo — instruct the tester to verify the actual outcome.
295
+ - Tester wants to "try again with different input" after a form was submitted. Submitting
296
+ again creates a duplicate; guide toward editing the existing record or accepting the state.
297
+
298
+ RESET-LOOP (decide "fail"):
299
+ - resetCount >= 2 and the previous resets did not change the underlying situation.
300
+ - The same flow has been attempted twice with the same failure mode.
301
+ - Repeating the reset cannot produce new information.
302
+
303
+ SCENARIO INAPPLICABLE (decide "skipped"):
304
+ - The feature the scenario targets does not exist on this app, or prerequisites cannot be met.
305
+
306
+ PRIORITY:
307
+ 1) Evidence of successful side effects in session_log (URL transition, new record visible).
308
+ If present, almost never allow the reset — the work is done.
309
+ 2) resetCount. Each prior reset raises the bar for allowing another.
310
+ 3) Tester's stated reason. Weigh it against the observed evidence, do not trust it blindly.
311
+
312
+ GUIDANCE FIELD (required when decision is "continue"):
313
+ Give a specific next action on the current page: which tool to call, what to verify, or how to
314
+ record the outcome. Do not suggest repeating actions that already succeeded.
315
+
316
+ EXPECTED RESULTS (milestones, not the goal):
317
+ ${task.expected.map((e) => `- ${e}`).join('\n')}
318
+ `;
319
+ }
155
320
  buildVerdictSystemPrompt(type, task) {
156
321
  return dedent `
157
322
  You are Pilot — the final decision maker for test pass/fail.
@@ -248,8 +413,12 @@ export class Pilot {
248
413
  the elements needed for the scenario. The page summary does not list every element.
249
414
  Prefer interacting with the current page over navigating away.
250
415
 
416
+ If you load a recipe via learn_experience, do NOT rewrite its code in your plan — the
417
+ raw recipe is forwarded to Tester automatically. Reference it by step ("apply recipe
418
+ steps 1–3, then…") and call out anywhere your scenario diverges from it.
419
+
251
420
  Be concise and specific. Tester will follow your plan.
252
- `, 'pilot.planTest', { tools: true, maxToolRoundtrips: 3, task });
421
+ `, 'pilot.planTest', { tools: true, planningOnly: true, maxToolRoundtrips: 3, task });
253
422
  }
254
423
  async reviewNewPage(task, currentState) {
255
424
  if (!this.conversation)
@@ -329,7 +498,10 @@ export class Pilot {
329
498
  }
330
499
  }
331
500
  this.conversation.addUserText(finalUserText);
332
- let tools = opts.tools ? this.agentTools : undefined;
501
+ let tools;
502
+ if (opts.tools) {
503
+ tools = opts.planningOnly ? this.pickPlanningTools() : this.agentTools;
504
+ }
333
505
  if (opts.tools && opts.task) {
334
506
  tools = { ...tools, ...this.buildPreconditionTool(opts.task) };
335
507
  }
@@ -338,7 +510,20 @@ export class Pilot {
338
510
  agentName: 'pilot',
339
511
  experimental_telemetry: { functionId },
340
512
  });
341
- return result?.response?.text || '';
513
+ const text = result?.response?.text || '';
514
+ const learned = (result?.toolExecutions || []).filter((e) => e.toolName === 'learn_experience' && e.output?.content).map((e) => e.output.content);
515
+ if (learned.length === 0)
516
+ return text;
517
+ return dedent `
518
+ ${text}
519
+
520
+ <applied_experience>
521
+ Recipes from prior successful runs that Pilot judged relevant. Locators worked then; the page may have changed since.
522
+ Treat code blocks below as a starting hypothesis. If a locator misses, fall back to ARIA/UI-map.
523
+
524
+ ${learned.join('\n\n')}
525
+ </applied_experience>
526
+ `;
342
527
  }
343
528
  getExperienceToc() {
344
529
  if (!this.experienceTracker)
@@ -350,6 +535,25 @@ export class Pilot {
350
535
  const toc = this.experienceTracker.getExperienceTableOfContents(actionResult);
351
536
  return renderExperienceToc(toc);
352
537
  }
538
+ pickPlanningTools() {
539
+ const { see, context, verify, research, getVisitedStates, xpathCheck, learn_experience } = this.agentTools ?? {};
540
+ const planning = {};
541
+ if (see)
542
+ planning.see = see;
543
+ if (context)
544
+ planning.context = context;
545
+ if (verify)
546
+ planning.verify = verify;
547
+ if (research)
548
+ planning.research = research;
549
+ if (getVisitedStates)
550
+ planning.getVisitedStates = getVisitedStates;
551
+ if (xpathCheck)
552
+ planning.xpathCheck = xpathCheck;
553
+ if (learn_experience)
554
+ planning.learn_experience = learn_experience;
555
+ return planning;
556
+ }
353
557
  buildPreconditionTool(task) {
354
558
  return {
355
559
  precondition: tool({
@@ -483,7 +687,15 @@ export class Pilot {
483
687
  }
484
688
  }
485
689
  if (text.includes('ATTACH_UI_MAP')) {
486
- const uiMap = await this.researcher.research(currentState);
690
+ let uiMap = '';
691
+ try {
692
+ uiMap = await this.researcher.research(currentState);
693
+ }
694
+ catch (err) {
695
+ if (!(err instanceof ErrorPageError))
696
+ throw err;
697
+ tag('warning').log(`Pilot UI map skipped: ${err.message}`);
698
+ }
487
699
  if (uiMap) {
488
700
  parts.push(dedent `
489
701
  <page_ui_map>
@@ -635,6 +847,13 @@ export class Pilot {
635
847
  - If the goal was achieved by a previous action (SUCCESS in recent_actions with confirming ariaDiff): instruct Tester to verify() the result and finish(). Do NOT repeat the same action.
636
848
  - If Tester keeps re-opening the same panel and re-submitting the same data — STOP. The action was already completed.
637
849
 
850
+ Action-goal alignment — classify every recent successful action:
851
+ - GOAL-ADVANCING: creates, edits, removes, submits, or verifies the scenario's subject data (the object the scenario actually changes).
852
+ - VIEW-ONLY: toggles layout, filters, tabs, segment controls, sort orders, collapse/expand — changes which data is shown without modifying it.
853
+ - A single VIEW-ONLY action is legitimate when needed to reveal a target element for the next GOAL-ADVANCING action.
854
+ - A run of two or more consecutive successful VIEW-ONLY actions with no interleaved GOAL-ADVANCING action is thrashing — Tester is exploring UI instead of executing the scenario. Redirect Tester to the specific mutation or verification the scenario requires.
855
+ - VIEW-ONLY actions also tend to produce large page diffs with many htmlParts; if you see that pattern repeatedly in recent_actions, treat it as evidence of thrashing.
856
+
638
857
  Navigation awareness — always compare current page url to START URL:
639
858
  - subpage navigation (deeper path from START URL) — OK, scenario may need sub-pages
640
859
  - outer-page navigation (parent/sibling path from START URL) — SUSPICIOUS. The scenario target is on the START page. Do NOT rationalize leaving it. Instruct Tester to back() or reset().
@@ -400,6 +400,34 @@ export class Planner extends PlannerBase {
400
400
  const allTests = this.currentPlan.getAllTests();
401
401
  const titleListing = allTests.map((t) => `- "${t.scenario}" [${t.result || 'pending'}]`).join('\n');
402
402
  const compactContext = planToCompactAiContext(this.currentPlan);
403
+ let planningStrategy;
404
+ if (feature) {
405
+ planningStrategy = dedent `
406
+ <planning_strategy>
407
+ Stay strictly inside the "${feature}" feature area. Do NOT switch to a different, unrelated feature even if it has no coverage.
408
+ Propose ${this.MIN_TASKS}-${this.MAX_TASKS} additional scenarios for "${feature}" that are not already in the tested list.
409
+ Use the <approach> above to decide which new angles to explore — different controls, inputs, states, outcome categories, or combinations — all within "${feature}".
410
+ Return an empty scenarios array only when no genuinely new scenario for "${feature}" remains.
411
+ </planning_strategy>
412
+ `;
413
+ }
414
+ else {
415
+ let extendedResearchHint = '';
416
+ if (mdq(plannerResearch).query('section("Extended Research")').count() > 0) {
417
+ extendedResearchHint = 'IMPORTANT: The research contains "Extended Research" sections with dropdowns, modals, and panels. Prioritize testing features from Extended Research that have no coverage yet.';
418
+ }
419
+ planningStrategy = dedent `
420
+ <planning_strategy>
421
+ Find a feature area in the research that has NO or minimal test coverage.
422
+ Pick that ONE feature and propose ${this.MIN_TASKS}-${this.MAX_TASKS} tests for it.
423
+ ${extendedResearchHint}
424
+
425
+ Follow the <approach> described above when proposing tests for this feature.
426
+
427
+ If ALL features across ALL research sections are covered, return empty scenarios array.
428
+ </planning_strategy>
429
+ `;
430
+ }
403
431
  conversation.addUserText(dedent `
404
432
  CRITICAL: This plan already has tests.
405
433
 
@@ -419,15 +447,7 @@ export class Planner extends PlannerBase {
419
447
  ${compactContext}
420
448
  </tested_scenarios>
421
449
 
422
- <planning_strategy>
423
- Find a feature area in the research that has NO or minimal test coverage.
424
- Pick that ONE feature and propose ${this.MIN_TASKS}-${this.MAX_TASKS} tests for it.
425
- ${mdq(plannerResearch).query('section("Extended Research")').count() > 0 ? 'IMPORTANT: The research contains "Extended Research" sections with dropdowns, modals, and panels. Prioritize testing features from Extended Research that have no coverage yet.' : ''}
426
-
427
- Follow the <approach> described above when proposing tests for this feature.
428
-
429
- If ALL features across ALL research sections are covered, return empty scenarios array.
430
- </planning_strategy>
450
+ ${planningStrategy}
431
451
 
432
452
  <context_from_previous_tests>
433
453
  During testing, the following pages were visited: