explorbot 0.1.12 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/bin/explorbot-cli.ts +21 -21
  2. package/dist/bin/explorbot-cli.js +3 -3
  3. package/dist/package.json +4 -2
  4. package/dist/rules/researcher/container-rules.md +2 -0
  5. package/dist/src/action-result.js +2 -1
  6. package/dist/src/action.js +3 -8
  7. package/dist/src/ai/captain.js +0 -2
  8. package/dist/src/ai/conversation.js +20 -4
  9. package/dist/src/ai/driller.js +1108 -0
  10. package/dist/src/ai/historian/utils.js +8 -1
  11. package/dist/src/ai/pilot.js +214 -267
  12. package/dist/src/ai/provider.js +25 -12
  13. package/dist/src/ai/quartermaster.js +2 -2
  14. package/dist/src/ai/rules.js +5 -5
  15. package/dist/src/ai/session-analyst.js +122 -0
  16. package/dist/src/ai/tester.js +69 -22
  17. package/dist/src/ai/tools.js +19 -4
  18. package/dist/src/commands/base-command.js +6 -6
  19. package/dist/src/commands/drill-command.js +3 -2
  20. package/dist/src/commands/exit-command.js +1 -0
  21. package/dist/src/commands/explore-command.js +9 -2
  22. package/dist/src/components/AddRule.js +1 -1
  23. package/dist/src/components/StatusPane.js +6 -1
  24. package/dist/src/experience-tracker.js +9 -0
  25. package/dist/src/explorbot.js +48 -8
  26. package/dist/src/explorer.js +11 -13
  27. package/dist/src/reporter.js +105 -4
  28. package/dist/src/state-manager.js +4 -3
  29. package/dist/src/stats.js +7 -1
  30. package/dist/src/test-plan.js +47 -3
  31. package/dist/src/utils/aria.js +354 -529
  32. package/dist/src/utils/hooks-runner.js +2 -8
  33. package/dist/src/utils/html.js +371 -0
  34. package/dist/src/utils/unique-names.js +12 -1
  35. package/dist/src/utils/url-matcher.js +6 -1
  36. package/dist/src/utils/web-element.js +27 -24
  37. package/dist/src/utils/xpath.js +1 -1
  38. package/package.json +4 -2
  39. package/rules/researcher/container-rules.md +2 -0
  40. package/src/action-result.ts +2 -1
  41. package/src/action.ts +3 -10
  42. package/src/ai/captain.ts +0 -2
  43. package/src/ai/conversation.ts +21 -4
  44. package/src/ai/driller.ts +1194 -0
  45. package/src/ai/historian/utils.ts +8 -1
  46. package/src/ai/pilot.ts +215 -265
  47. package/src/ai/provider.ts +24 -12
  48. package/src/ai/quartermaster.ts +2 -2
  49. package/src/ai/rules.ts +5 -5
  50. package/src/ai/session-analyst.ts +139 -0
  51. package/src/ai/tester.ts +63 -20
  52. package/src/ai/tools.ts +18 -4
  53. package/src/commands/base-command.ts +6 -6
  54. package/src/commands/drill-command.ts +3 -2
  55. package/src/commands/exit-command.ts +1 -0
  56. package/src/commands/explore-command.ts +10 -2
  57. package/src/components/AddRule.tsx +1 -1
  58. package/src/components/StatusPane.tsx +6 -3
  59. package/src/config.ts +4 -0
  60. package/src/experience-tracker.ts +9 -0
  61. package/src/explorbot.ts +55 -10
  62. package/src/explorer.ts +10 -12
  63. package/src/reporter.ts +108 -4
  64. package/src/state-manager.ts +4 -3
  65. package/src/stats.ts +10 -1
  66. package/src/test-plan.ts +62 -3
  67. package/src/utils/aria.ts +367 -537
  68. package/src/utils/hooks-runner.ts +2 -6
  69. package/src/utils/html.ts +381 -0
  70. package/src/utils/unique-names.ts +13 -0
  71. package/src/utils/url-matcher.ts +5 -1
  72. package/src/utils/web-element.ts +31 -28
  73. package/src/utils/xpath.ts +1 -1
  74. package/dist/src/ai/bosun.js +0 -456
  75. package/src/ai/bosun.ts +0 -571
@@ -69,16 +69,17 @@ export class Pilot {
69
69
  const stateContext = this.buildStateContext(currentState);
70
70
  const notes = task.notesToString() || 'No notes recorded.';
71
71
  let visualAnalysis = '';
72
+ let screenshotState = null;
72
73
  if (this.provider.hasVision()) {
73
74
  try {
74
75
  const action = this.explorer.createAction();
75
- const screenshotState = await action.caputrePageWithScreenshot();
76
+ screenshotState = await action.caputrePageWithScreenshot();
76
77
  if (screenshotState.screenshot) {
77
78
  visualAnalysis = (await this.researcher.answerQuestionAboutScreenshot(screenshotState, `Describe current page state relevant to: ${task.scenario}`)) || '';
78
79
  }
79
80
  }
80
81
  catch {
81
- // vision not available, continue without
82
+ screenshotState = null;
82
83
  }
83
84
  }
84
85
  const schema = z.object({
@@ -88,7 +89,7 @@ export class Pilot {
88
89
  requestVerification: z
89
90
  .string()
90
91
  .nullable()
91
- .describe('REQUIRED whenever decision is "pass" — provide a specific assertion that proves the scenario goal on the current page (e.g., "New test suite \\"Foo\\" is visible in the suites list"). The system runs it and bakes the resulting assertion into the generated test file; without it the test file has no verifiable expect(). Also use when evidence is insufficient before deciding pass/fail. Leave null for "continue", "fail", or "skipped".'),
92
+ .describe('REQUIRED whenever decision is "pass" — a one-sentence natural-language claim about the current page that, if true, proves the scenario goal (e.g., "New test suite \\"Foo\\" is visible in the suites list"). NOT code: do not write I.*, expect(), .then(), grabTitle, or any JavaScript. Navigator translates the claim into CodeceptJS assertions and runs them; passing assertions are saved to the generated test file. Also use when evidence is insufficient before deciding pass/fail. Leave null for "continue", "fail", or "skipped".'),
92
93
  });
93
94
  const userContent = dedent `
94
95
  Tester wants to ${type} the test.
@@ -109,19 +110,20 @@ export class Pilot {
109
110
  ${sessionLog || 'No actions recorded'}
110
111
  </session_log>
111
112
 
112
- Decide:
113
- - "pass" ONLY if the SCENARIO GOAL is fully accomplished (not just milestones)
114
- - "fail" if the scenario was attempted but failed
115
- - "skipped" if the scenario is irrelevant/inapplicable OR systematic execution failures prevented testing (e.g., repeated LLM errors, navigation crashes, tool failures unrelated to the scenario)
116
- - "continue" if tester hasn't completed the scenario goal yet even if milestones were checked
117
- - If evidence is mixed, but final state indicates goal completion, choose "pass"
118
- - If evidence is mixed and final state is unclear, prefer "continue" over "fail"
119
-
120
- When deciding "pass", you MUST also set requestVerification to a CodeceptJS assertion that
121
- proves the scenario goal on the current page. Choose the strongest single evidence (a unique
122
- element/text that exists ONLY because the scenario succeeded). The assertion is executed and
123
- then converted into the spec file's expect() without it the generated test has nothing to
124
- assert and is worthless.
113
+ Decide and commit. "continue" extends the loop and burns iterations — choose it only when
114
+ evidence is genuinely insufficient to call pass/fail, not as a safety hedge.
115
+ - "pass" if final state proves the SCENARIO GOAL is accomplished. Set requestVerification.
116
+ - "fail" if scenario was attempted but goal not achieved.
117
+ - "skipped" if scenario is irrelevant/inapplicable, OR systematic infrastructure failures.
118
+ - "continue" only when a concrete missing piece of evidence (a verify/see) would change your verdict.
119
+ - Mixed evidence + final state shows success → pass. Mixed + final state unclear continue with guidance.
120
+
121
+ When deciding "pass", you MUST also set requestVerification to a one-sentence natural-language
122
+ claim about the current page (e.g., "New test suite Foo is visible in the suites list"). NOT
123
+ code do not write I.*, expect(), .then(), or any JavaScript. Choose the strongest single
124
+ piece of evidence (a unique element/text that exists ONLY because the scenario succeeded).
125
+ Navigator translates the claim into CodeceptJS assertions; without it the generated test has
126
+ nothing to assert and is worthless.
125
127
  `;
126
128
  const messages = [
127
129
  {
@@ -140,44 +142,28 @@ export class Pilot {
140
142
  task.finish(TestResult.FAILED);
141
143
  return false;
142
144
  }
143
- if (result.requestVerification && navigator) {
145
+ if (result.decision === 'pass' && result.requestVerification && navigator) {
144
146
  tag('substep').log(`Pilot requesting verification: ${result.requestVerification}`);
145
- try {
146
- const verifyResult = await navigator.verifyState(result.requestVerification, currentState);
147
- if (verifyResult.verified) {
148
- if (verifyResult.assertionSteps?.length) {
149
- this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
150
- }
151
- tag('substep').log(`Pilot verified: ${result.requestVerification}`);
152
- }
153
- else {
154
- tag('substep').log(`Pilot verification failed: ${result.requestVerification}`);
155
- if (result.decision === 'pass') {
156
- const flipMessage = `Verification "${result.requestVerification}" did not match the page. Adjust approach and re-verify before finishing.`;
157
- result.decision = 'continue';
158
- result.reason = flipMessage;
159
- result.guidance = result.guidance ?? flipMessage;
160
- }
161
- }
162
- }
163
- catch (verifyErr) {
164
- tag('warning').log(`Pilot verification errored: ${verifyErr.message}`);
147
+ const verifyResult = await navigator.verifyState(result.requestVerification, currentState).catch(() => null);
148
+ if (verifyResult?.verified && verifyResult.assertionSteps?.length) {
149
+ this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
165
150
  }
166
151
  }
167
152
  tag('info').log(`Pilot: ${result.decision} — ${result.reason}`);
168
153
  task.summary = result.reason;
154
+ const verdictState = screenshotState || currentState;
169
155
  if (result.decision === 'pass') {
170
- task.addNote(`Pilot: ${result.reason}`, TestResult.PASSED);
156
+ task.setVerification(`Pilot: ${result.reason}`, TestResult.PASSED, verdictState);
171
157
  task.finish(TestResult.PASSED);
172
158
  return false;
173
159
  }
174
160
  if (result.decision === 'fail') {
175
- task.addNote(`Pilot: ${result.reason}`, TestResult.FAILED);
161
+ task.setVerification(`Pilot: ${result.reason}`, TestResult.FAILED, verdictState);
176
162
  task.finish(TestResult.FAILED);
177
163
  return false;
178
164
  }
179
165
  if (result.decision === 'skipped') {
180
- task.addNote(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED);
166
+ task.setVerification(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED, verdictState);
181
167
  task.finish(TestResult.SKIPPED);
182
168
  return false;
183
169
  }
@@ -270,107 +256,89 @@ export class Pilot {
270
256
  return true;
271
257
  }
272
258
  }
273
- buildResetSystemPrompt(task) {
259
+ buildSharedEvidenceRules(task) {
274
260
  return dedent `
275
- You are Pilot — the supervisor that decides whether a reset is legitimate.
276
- Tester wants to reset (navigate back to the start URL and discard progress).
277
-
278
261
  SCENARIO: ${task.scenario}
279
262
 
280
- Reset is DESTRUCTIVE. It abandons all work done in this iteration. In stateful apps, any
281
- side effects (records created, forms submitted) persist on the server — resetting does not
282
- undo them. Unnecessary resets create duplicate data and loop forever.
263
+ EVIDENCE PRIORITY (strict):
264
+ 1) Final observable state proving the scenario goal
265
+ 2) verify()/see() results in the LAST few actions before stop/finish
266
+ 3) Intermediate action outcomes (diagnostic, not decisive)
267
+ Mixed evidence with a clear final-state success → pass. Mixed with unclear final state → continue.
268
+
269
+ EVIDENCE SOURCES disagree often: verify(), see(), visual_analysis, session_log. No single source
270
+ overrides the others — weigh them together. Tester's record() notes are the LEAST reliable; always
271
+ cross-check against actual actions and state. Visual screenshot analysis is strong for UI state
272
+ (active tabs, visible counts, colors).
273
+
274
+ SCENARIO TITLE defines what must happen. Action verbs require persisted evidence:
275
+ - "Create X" → X must exist (visible, redirected to its page, or success message). Opening a form is NOT enough.
276
+ - "Delete X" → X must be gone. Clicking delete is NOT enough.
277
+ - "Edit X" → updated value must be persisted (visible in list/detail). Opening edit is NOT enough; redirect after save with the new value visible IS enough.
278
+ - Negative tests ("without a name", "invalid", "duplicate", "unauthorized") → success means the system PREVENTED the action with validation/error.
279
+
280
+ PROVENANCE for create/edit scenarios: the task prompt instructs the tester to inject the
281
+ session marker "${task.sessionName ?? ''}" into newly created or edited free-text values.
282
+ When that marker COULD be injected, the entity used as proof MUST contain it. A record
283
+ matching the goal by text alone but missing the marker is a stale leftover from a prior
284
+ run — it is NOT evidence the current scenario produced anything. Vote \`fail\`, not \`pass\`.
285
+ This does not apply when the field is restricted (numeric only, enum, etc.) or when the
286
+ session_log shows no fillField/type/select actions were attempted at all (in that case
287
+ the scenario clearly didn't run — also vote \`fail\`).
288
+
289
+ Expected results are MILESTONES, not the goal. Never fail because a milestone (toast, icon, styling)
290
+ didn't match if the scenario goal IS accomplished.
283
291
 
284
- LEGITIMATE RESET (decide "allow"):
285
- - The current page is unrelated to the scenario and no path leads back.
286
- - Navigation is stuck in an error state with no recoverable action.
287
- - The tester arrived on a page that cannot host the scenario at all.
292
+ ${this.buildDeletionScope(task)}
288
293
 
289
- ILLEGITIMATE RESET (decide "continue"):
290
- - The previous action already succeeded (URL changed to a success/detail page, record visible,
291
- confirmation shown) and tester wants to redo it because an assertion did not match.
292
- The work is done — verify, record, or finish instead of restarting.
293
- - A single expectation / milestone does not match app reality but the scenario goal may still
294
- have been achieved. Do not redo — instruct the tester to verify the actual outcome.
295
- - Tester wants to "try again with different input" after a form was submitted. Submitting
296
- again creates a duplicate; guide toward editing the existing record or accepting the state.
294
+ EXPECTED RESULTS (milestones):
295
+ ${task.expected.map((e) => `- ${e}`).join('\n')}
296
+ `;
297
+ }
298
+ buildResetSystemPrompt(task) {
299
+ return dedent `
300
+ You are Pilot decide whether a reset is legitimate. Reset is DESTRUCTIVE: it abandons this
301
+ iteration's work, but server-side side effects (records created, forms submitted) persist.
302
+ Unnecessary resets create duplicate data and infinite loops.
297
303
 
298
- RESET-LOOP (decide "fail"):
299
- - resetCount >= 2 and the previous resets did not change the underlying situation.
300
- - The same flow has been attempted twice with the same failure mode.
301
- - Repeating the reset cannot produce new information.
304
+ ${this.buildSharedEvidenceRules(task)}
302
305
 
303
- SCENARIO INAPPLICABLE (decide "skipped"):
304
- - The feature the scenario targets does not exist on this app, or prerequisites cannot be met.
306
+ DECISION:
307
+ - "allow": current page cannot host the scenario, irrecoverable error, or no path back.
308
+ - "continue": prior action already succeeded (URL changed, record visible, confirmation shown) — verify/finish instead. Or scenario goal may already be met; instruct tester to verify the actual outcome rather than redo. Provide guidance.
309
+ - "fail": resetCount >= 2 and underlying situation hasn't changed; same flow tried twice with same failure mode.
310
+ - "skipped": feature doesn't exist on this app or prerequisites can't be met.
305
311
 
306
312
  PRIORITY:
307
- 1) Evidence of successful side effects in session_log (URL transition, new record visible).
308
- If present, almost never allow the reset the work is done.
309
- 2) resetCount. Each prior reset raises the bar for allowing another.
310
- 3) Tester's stated reason. Weigh it against the observed evidence, do not trust it blindly.
311
-
312
- GUIDANCE FIELD (required when decision is "continue"):
313
- Give a specific next action on the current page: which tool to call, what to verify, or how to
314
- record the outcome. Do not suggest repeating actions that already succeeded.
313
+ 1) Successful side effects in session_log almost never allow reset.
314
+ 2) resetCount each prior reset raises the bar.
315
+ 3) Tester's stated reason weigh against evidence, don't trust blindly.
315
316
 
316
- EXPECTED RESULTS (milestones, not the goal):
317
- ${task.expected.map((e) => `- ${e}`).join('\n')}
317
+ GUIDANCE (required for "continue"): a specific next action on the current page — which tool, what
318
+ to verify, how to record. Do not suggest repeating actions that already succeeded.
318
319
  `;
319
320
  }
320
321
  buildVerdictSystemPrompt(type, task) {
321
322
  return dedent `
322
- You are Pilot — the final decision maker for test pass/fail.
323
- Tester has requested to ${type} the test. Review the evidence and decide.
324
-
325
- SCENARIO: ${task.scenario}
326
-
327
- The SCENARIO is the primary goal. The test can only pass if the scenario goal is fully accomplished.
328
- PRIORITY ORDER (strict):
329
- 1) Final observable state proving the scenario goal
330
- 2) Verification evidence (if provided)
331
- 3) Intermediate action/step outcomes
332
- If final state evidence proves the scenario goal, PASS even when some intermediate actions failed.
333
- Do not fail only because a specific click failed, no toast appeared, or navigation was different than expected.
334
- Intermediate failures are diagnostic, not decisive, when end state confirms success.
335
- Expected results are helpful milestones but they DO NOT override the scenario goal.
336
- NEVER fail a test because an expected result (milestone) was not met when the scenario goal itself IS accomplished.
337
- The SCENARIO TITLE defines what must happen. If the title says "Create X and verify it appears" and X was created and appears that's a PASS, even if some milestone about icons/status/styling was not met.
338
- If the scenario says "Create X", then X must be created — opening a form or navigating to /new URL is NOT enough. There must be evidence that the item now exists: visible on page, redirected to the item's page, or a success/confirmation message appeared.
339
- If the scenario says "Delete X", then X must be deleted — clicking delete button is not enough. There must be evidence the item is gone.
340
- If the scenario says "Edit X", then changes must be saved — opening an edit form is NOT enough.
341
- For edit/update/rename scenarios, persisted updated value visible in list/detail view is valid save evidence, even without toast and even if page redirected away from edit view.
342
- DO NOT trust Tester's self-assessment in notes (like "scenario goal achieved"). Verify against actual actions and state.
343
- EVIDENCE SOURCES: verify(), see(), visual_analysis, and action results in session_log are all evidence. They may disagree — analyze all of them together to reach your decision. No single source automatically overrides the others. Visual analysis from screenshots is strong evidence for UI state (active tabs, visible items, counts, colors). Tester's self-assessment in record() notes is the least reliable — always cross-check against actual evidence.
344
- SESSION LOG shows ALL actions grouped by URL. If the scenario requires changing data (edit/create/delete) but all form/click actions FAILED, the test cannot pass — even if a verify() found matching content that existed before the test.
345
-
346
- VERIFICATION RULE: Only the LAST few actions before finish/stop count as verification evidence.
347
- - If verify() or see() is among the last actions → use its result as evidence.
348
- - If no verification was done → prefer "continue" with guidance telling tester what to verify.
349
- - If verify assertion describes a state that was ALREADY TRUE before the test started, the verification proves nothing — reject with "continue".
350
-
351
- GUIDANCE FIELD: When decision is "continue", you MUST provide "guidance" — a specific actionable instruction:
352
- - If evidence is insufficient: tell tester to verify with see()/verify(), specify WHAT to check
353
- - If approach was wrong: tell tester to try a different method, suggest which one
354
- - If remaining steps exist: tell tester which steps to complete next
355
- Be concrete. Example: "Use see() to check if the description text appears in the Description tab panel" not "verify the result".
356
- Do NOT tell tester to redo the same actions that already succeeded.
357
-
358
- NEGATIVE TESTS: Some scenarios test that something CANNOT or SHOULD NOT happen.
359
- Patterns: "without a name", "with invalid data", "empty field", "wrong password", "unauthorized", "duplicate".
360
- For negative tests, success means the system PREVENTED the action — error messages, validation, disabled buttons.
361
- Example: "Create X without a name" PASSES if X was NOT created and validation appeared.
362
-
363
- SKIPPED TESTS: Choose "skipped" in two cases:
364
- 1) Scenario is irrelevant: feature doesn't exist on the page, required UI elements are completely absent, scenario prerequisites cannot be met.
365
- 2) Systematic execution failures: repeated LLM/API errors, navigation crashes, tool failures unrelated to the scenario itself. These are infrastructure problems, not test failures.
366
- Do NOT use "skipped" when the feature exists but the test just failed to interact with it — that's "fail" or "continue".
367
-
368
- ${this.buildDeletionScope(task)}
369
-
370
- REASON FORMAT: The "reason" field goes into the test report. Do NOT start with "The scenario goal was/was not achieved" or similar status phrases — the decision field already conveys that. Instead, state what happened: what was verified, what failed, or what evidence was found.
371
-
372
- EXPECTED RESULTS (milestones, not the goal):
373
- ${task.expected.map((e) => `- ${e}`).join('\n')}
323
+ You are Pilot — final decision maker for test pass/fail. Tester requested ${type}. Review the
324
+ evidence and commit to a verdict; "continue" only when evidence is genuinely insufficient.
325
+
326
+ ${this.buildSharedEvidenceRules(task)}
327
+
328
+ DECISION:
329
+ - "pass": scenario goal is fully accomplished. Set requestVerification to a one-sentence claim about
330
+ the current page that proves it (a unique element/text that exists ONLY because the scenario succeeded).
331
+ Pick assertions DOM can express; for non-DOM regions (iframes, canvas, Monaco/CodeMirror), target a
332
+ stable landmark (container, ARIA role) instead of literal inner text. Your "pass" stands even if the
333
+ DOM assertion can't be made.
334
+ - "fail": scenario was attempted but the goal was not achieved.
335
+ - "skipped": scenario is irrelevant to the app, OR systematic infrastructure failures (LLM errors,
336
+ crashes) prevented testing. NOT for "test failed to interact" that's "fail" or "continue".
337
+ - "continue": tester hasn't completed the goal; provide concrete guidance (which tool, what to check).
338
+ If a verify() asserted a state that was ALREADY TRUE before the test, it proves nothing reject.
339
+
340
+ reason field: do NOT restate the decision ("scenario goal achieved/not achieved"). State what happened
341
+ what was verified, what failed, what evidence was found.
374
342
  `;
375
343
  }
376
344
  async planTest(task, currentState) {
@@ -380,7 +348,9 @@ export class Pilot {
380
348
  allowNewResearch: false,
381
349
  });
382
350
  const agenticModel = this.provider.getAgenticModel('pilot');
383
- this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState, pageSummary), 'pilot', agenticModel);
351
+ this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState), 'pilot', agenticModel);
352
+ this.conversation.markLastMessageCacheable();
353
+ this.conversation.protectPrefix(1);
384
354
  const stateContext = this.buildStateContext(currentState);
385
355
  return this.sendToPilot(dedent `
386
356
  <state>
@@ -420,7 +390,7 @@ export class Pilot {
420
390
  Be concise and specific. Tester will follow your plan.
421
391
  `, 'pilot.planTest', { tools: true, planningOnly: true, maxToolRoundtrips: 3, task });
422
392
  }
423
- async reviewNewPage(task, currentState) {
393
+ async reviewNewPage(task, currentState, testerConversation) {
424
394
  if (!this.conversation)
425
395
  return '';
426
396
  tag('substep').log('Pilot reviewing new page...');
@@ -430,7 +400,13 @@ export class Pilot {
430
400
  if (!pageSummary)
431
401
  return '';
432
402
  const stateContext = this.buildStateContext(currentState);
403
+ const toolCalls = testerConversation
404
+ .getToolExecutions()
405
+ .filter((t) => t.wasSuccessful)
406
+ .slice(-this.stepsToReview);
407
+ const actionsContext = this.formatActions(toolCalls);
433
408
  this.conversation.cleanupTag('page_summary', '...trimmed...', 1);
409
+ this.conversation.cleanupTag('recent_actions', '...trimmed...', 2);
434
410
  return this.sendToPilot(dedent `
435
411
  Navigated to new page.
436
412
  START URL: ${task.startUrl}
@@ -443,6 +419,10 @@ export class Pilot {
443
419
  ${pageSummary}
444
420
  </page_summary>
445
421
 
422
+ <recent_actions>
423
+ ${actionsContext || 'None'}
424
+ </recent_actions>
425
+
446
426
  ${this.formatExpectations(task)}
447
427
 
448
428
  First: evaluate whether this navigation makes sense for the scenario goal. If the page is unrelated, instruct Tester to back() or reset(). Then plan next steps.
@@ -451,11 +431,9 @@ export class Pilot {
451
431
  async analyzeProgress(task, currentState, testerConversation) {
452
432
  tag('substep').log('Pilot analyzing progress...');
453
433
  if (!this.conversation) {
454
- const pageSummary = await this.researcher.summary(currentState, {
455
- allowNewResearch: false,
456
- });
457
434
  const agenticModel = this.provider.getAgenticModel('pilot');
458
- this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState, pageSummary), 'pilot', agenticModel);
435
+ this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState), 'pilot', agenticModel);
436
+ this.conversation.markLastMessageCacheable();
459
437
  }
460
438
  const toolCalls = testerConversation.getToolExecutions().slice(-this.stepsToReview);
461
439
  const actionsContext = this.formatActions(toolCalls);
@@ -508,6 +486,7 @@ export class Pilot {
508
486
  const result = await this.provider.invokeConversation(this.conversation, tools, {
509
487
  maxToolRoundtrips: opts.maxToolRoundtrips ?? 0,
510
488
  agentName: 'pilot',
489
+ stopWhen: opts.task ? () => opts.task.hasFinished : undefined,
511
490
  experimental_telemetry: { functionId },
512
491
  });
513
492
  const text = result?.response?.text || '';
@@ -566,12 +545,18 @@ export class Pilot {
566
545
  tag('info').log(`Precondition: ${description}`);
567
546
  debugLog(`precondition: ${description}, fisherman: ${this.fisherman?.isAvailable() ? 'available' : 'none'}`);
568
547
  if (!this.fisherman || !this.fisherman.isAvailable()) {
548
+ const skipReason = await this.checkDataAvailability(task, description, 'Fisherman not available');
549
+ if (skipReason)
550
+ return { noted: true, prepared: false, skipped: true, reason: skipReason };
569
551
  return { noted: true, prepared: false, reason: 'Fisherman not available' };
570
552
  }
571
553
  const result = await this.fisherman.prepareData(description, task.startUrl, task.sessionName);
572
554
  if (!result.success || result.created.length === 0) {
573
555
  if (result.summary)
574
556
  tag('warning').log(`Precondition failed: ${result.summary}`);
557
+ const skipReason = await this.checkDataAvailability(task, description, result.summary);
558
+ if (skipReason)
559
+ return { noted: true, prepared: false, skipped: true, reason: skipReason };
575
560
  return { noted: true, prepared: false, reason: result.summary };
576
561
  }
577
562
  const items = result.created.map((c) => {
@@ -590,6 +575,36 @@ export class Pilot {
590
575
  }),
591
576
  };
592
577
  }
578
+ async checkDataAvailability(task, requestedData, fishermanReason) {
579
+ if (!this.provider.hasVision())
580
+ return null;
581
+ const action = this.explorer.createAction();
582
+ const screenshotState = await action.caputrePageWithScreenshot().catch(() => null);
583
+ if (!screenshotState?.screenshot)
584
+ return null;
585
+ const question = dedent `
586
+ Test scenario: "${task.scenario}"
587
+ Data we tried to create automatically (and failed): ${requestedData}
588
+ Failure reason: ${fishermanReason || 'unknown'}
589
+
590
+ Looking at the current page only, can this scenario still be carried out?
591
+ - YES if the page already shows the items the scenario will act on, OR if the page exposes a UI control that creates such items (an "Add", "New", "+" button, an empty-state CTA, etc.).
592
+ - NO if the scenario needs items that aren't visible AND there is no way to create them from this page (e.g. a filter/search/select scenario over an empty list with no creation affordance).
593
+
594
+ Reply with YES or NO on the first line, then a one-sentence reason on the second line.
595
+ `;
596
+ const answer = await this.researcher.answerQuestionAboutScreenshot(screenshotState, question);
597
+ if (!answer)
598
+ return null;
599
+ const firstLine = answer.split('\n')[0]?.trim().toUpperCase() ?? '';
600
+ if (!firstLine.startsWith('NO'))
601
+ return null;
602
+ const reason = answer.split('\n').slice(1).join(' ').trim() || 'Required data is absent and cannot be created from this page';
603
+ task.setVerification(`Pilot: skipped — ${reason}`, TestResult.SKIPPED, screenshotState);
604
+ task.finish(TestResult.SKIPPED);
605
+ tag('info').log(`Pilot: precondition failed and page lacks required data — skipping test (${reason})`);
606
+ return reason;
607
+ }
593
608
  buildStateContext(state) {
594
609
  const lines = [];
595
610
  lines.push(`url: ${state.url}`);
@@ -741,12 +756,13 @@ export class Pilot {
741
756
  }
742
757
  }
743
758
  const analysisText = exec.output?.analysis;
744
- const resultMessage = analysisText ? (analysisText.length > 500 ? `${analysisText.slice(0, 500)}...` : analysisText) : exec.output?.message || exec.output?.result;
759
+ const resultMessage = analysisText ? (analysisText.length > 300 ? `${analysisText.slice(0, 300)}...` : analysisText) : exec.output?.message || exec.output?.result;
745
760
  if (resultMessage && (CHECK_TOOLS.includes(exec.toolName) || !exec.wasSuccessful)) {
746
761
  line += `\n result: ${resultMessage}`;
747
762
  }
748
763
  groups.get(currentUrl).lines.push(line);
749
764
  }
765
+ const PER_GROUP_CAP = 25;
750
766
  const parts = [];
751
767
  for (const [url, group] of groups) {
752
768
  const header = [url];
@@ -757,7 +773,11 @@ export class Pilot {
757
773
  if (group.h3)
758
774
  header.push(` h3: ${group.h3}`);
759
775
  header.push('');
760
- const lines = group.lines.map((l) => ` ${l}`);
776
+ const omitted = Math.max(0, group.lines.length - PER_GROUP_CAP);
777
+ const visibleLines = omitted > 0 ? group.lines.slice(-PER_GROUP_CAP) : group.lines;
778
+ const lines = visibleLines.map((l) => ` ${l}`);
779
+ if (omitted > 0)
780
+ lines.unshift(` [...${omitted} earlier action(s) omitted...]`);
761
781
  parts.push([...header, ...lines].join('\n'));
762
782
  }
763
783
  return parts.join('\n\n');
@@ -814,11 +834,11 @@ export class Pilot {
814
834
  }
815
835
  return '';
816
836
  }
817
- getSystemPrompt(task, initialState, pageSummary) {
837
+ getSystemPrompt(task, initialState) {
818
838
  const interactive = isInteractive();
819
839
  const stepsText = task.plannedSteps.length > 0 ? task.plannedSteps.map((s, i) => `${i + 1}. ${s}`).join('\n') : 'No planned steps';
820
840
  return dedent `
821
- You are Pilot - a supervisor that detects problems and intervenes only when needed.
841
+ You are Pilot a supervisor that detects problems and intervenes only when needed.
822
842
 
823
843
  SCENARIO: ${task.scenario}
824
844
  START URL: ${initialState.url}
@@ -830,136 +850,63 @@ export class Pilot {
830
850
  PLANNED STEPS:
831
851
  ${stepsText}
832
852
 
833
- ${pageSummary ? `PAGE SUMMARY:\n${pageSummary}` : ''}
834
-
835
- Your job:
836
- 1. Plan test execution by reviewing page elements and scenario requirements
837
- 2. When Tester navigates to a new page, review available elements and plan next steps
838
- 3. Detect when Tester is stuck: repeated failures, loops, or wrong direction
839
- 4. Track which expectations have been checked and which remain
840
- 5. When problems are detected, suggest concrete alternative approaches
841
- 6. When everything is going well, give brief encouragement and let Tester continue
842
- 7. Before suggesting navigation to another page, assume the current page may already have what the scenario needs. The page summary is incomplete — not every element is listed. Prefer exploring the current page first.
843
-
844
- Already-achieved state detection:
845
- - When planning or reviewing, check if the scenario goal is ALREADY met in the current state (page_summary, ariaDiff, or state context).
846
- - If the goal appears already achieved at start: adapt the scenario suggest different input values or data to make the test meaningful.
847
- - If the goal was achieved by a previous action (SUCCESS in recent_actions with confirming ariaDiff): instruct Tester to verify() the result and finish(). Do NOT repeat the same action.
848
- - If Tester keeps re-opening the same panel and re-submitting the same data — STOP. The action was already completed.
849
-
850
- Action-goal alignmentclassify every recent successful action:
851
- - GOAL-ADVANCING: creates, edits, removes, submits, or verifies the scenario's subject data (the object the scenario actually changes).
852
- - VIEW-ONLY: toggles layout, filters, tabs, segment controls, sort orders, collapse/expand changes which data is shown without modifying it.
853
- - A single VIEW-ONLY action is legitimate when needed to reveal a target element for the next GOAL-ADVANCING action.
854
- - A run of two or more consecutive successful VIEW-ONLY actions with no interleaved GOAL-ADVANCING action is thrashing — Tester is exploring UI instead of executing the scenario. Redirect Tester to the specific mutation or verification the scenario requires.
855
- - VIEW-ONLY actions also tend to produce large page diffs with many htmlParts; if you see that pattern repeatedly in recent_actions, treat it as evidence of thrashing.
856
-
857
- Navigation awareness always compare current page url to START URL:
858
- - subpage navigation (deeper path from START URL) OK, scenario may need sub-pages
859
- - outer-page navigation (parent/sibling path from START URL) SUSPICIOUS. The scenario target is on the START page. Do NOT rationalize leaving it. Instruct Tester to back() or reset().
860
- - outer-site navigation (different domain) WRONG. Instruct Tester to reset() immediately.
861
-
862
- IMPORTANT Tool usage policy:
863
- - DO NOT use tools (see, context) when Tester is making progress and no failures are recorded
864
- - Tester already has full ARIA and HTML context do not duplicate that work
865
- - ONLY use see/context tools when Tester has failed 2+ times on the same element or action
866
- - Use xpathCheck proactively when Tester fails to find an element even ONCE (element not found error)
867
- - If Tester's ARIA locator used wrong role (e.g. "textbox" instead of "combobox"), use xpathCheck to identify the correct element
868
- - After finding the element via xpathCheck, include the discovered locator in your NEXT instruction
869
- ${interactive ? '- Use askUser() only as last resort when automated recovery has failed' : ''}
870
-
871
- Diagnosing failures — use <state> context:
872
- - Button click failed AND that button is in "disabled buttons" → button is disabled, not missing. Check "active form" for unfilled [required] fields. Instruct Tester to fill required fields first.
873
- - Form submit failed → check "active form" for fields that may need values. Instruct Tester to fill them before retrying submit.
874
- - "modal: none" but Tester tries to interact with a modal → modal was closed or never opened. Instruct Tester to re-trigger the modal.
875
- - Actions succeed but ariaDiff is empty → action may have worked without visible DOM changes. Check result message before assuming failure.
876
- - Multiple elements matched (MultipleElementsFound) → use xpathCheck() to inspect the matched elements and determine which one is correct. Then instruct Tester with a precise locator or suggest visualClick() to click the right element by visual appearance.
877
- - Tester navigated to a page unrelated to the scenario (e.g., settings instead of feature page) use getVisitedStates() to check which pages were visited, then suggest back() to return to a relevant page, or reset() if multiple wrong navigations occurred. Do NOT try navigating back via breadcrumbs or links — SPA frameworks make manual back-navigation unreliable.
878
- - If diagnosis is unclear, ariaDiff is empty, and your previous advice didn't help → suggest Tester use see() to visually inspect the page. But ONLY as a last resort after other diagnostics failed.
879
- - Click succeeded but ariaDiff shows elements unrelated to tester's intention (e.g., clicked "Edit" but dropdown appeared) → wrong button or unexpected behavior. Instruct Tester to Escape and try a different approach.
880
- - form(I.type()) succeeded I.type() sends keys to whatever is focused, no guarantee it's the right field. Instruct Tester to verify with see() that text appeared in the correct field. If targetedHtml shows a button/link, text went to wrong element — click the correct field first and retry.
881
- - ariaDiff shows 5+ elements removed/added after clicking content → page entered a different mode (editor, panel, modal). Instruct Tester to call context() to see current state before guessing selectors.
882
- - Dropdown/select opened but contains NO options, or a list/table is empty when items were expected → data doesn't exist yet. Call precondition() to create the missing items (labels, categories, etc.), then instruct Tester to retry.
883
- - Tester tries to select/filter/assign something but the option list is empty or expected value is not present → missing auxiliary data. Call precondition() to create it.
884
-
885
- Detecting logically wrong successes review "executed", "element", and "skipped" fields:
886
- - Click SUCCESS but "executed" command differs from "explanation" intent → wrong element was clicked. The intended element wasn't found and a different one was clicked instead.
887
- - Click SUCCESS with "skipped" commands listed → earlier attempts failed, fell through to a different locator. Check if the successful locator actually targets the intended element.
888
- - form(I.type()) SUCCESS but "element" shows a button/link instead of input → text went to wrong element. Instruct Tester to click the correct input first.
889
- - Action SUCCESS but ariaDiff shows changes unrelated to the stated goal → action hit the wrong target. Instruct Tester to undo (Escape/back) and retry with precise locator.
890
- - If Tester's explanation mentions TWO distinct actions in ONE tool call → flag this. Each distinct action should be a separate tool call. Instruct Tester to split into individual steps.
891
-
892
- Complex component patterns — when Tester fails to interact with dropdowns/selects:
893
- - Search-and-select dropdowns require a SEQUENCE: click/focus the trigger input, type to filter, then click an option from the dropdown list. Instruct Tester to split this into separate tool calls.
894
- - If Tester clicks a generic dropdown trigger and ariaDiff shows unrelated options → wrong dropdown was triggered. Instruct Tester to use a more specific selector with container context.
895
- - If Tester types into an input but no dropdown appears → they may need to click the trigger element first. Suggest using context() to check the current DOM state.
896
-
897
- Tester ignoring visible elements:
898
- - If <state> shows "active form" fields but Tester is clicking elements not found in ARIA, or trying buttons that don't exist → Tester is ignoring interactive elements that are actually on the page. Instruct Tester to focus on the elements listed in "active form" — these are the real interactive controls on the current page. The UI map may be outdated.
899
-
900
- When Tester IS stuck finding an element, use xpathCheck() with COMBINED XPaths:
901
- - NEVER guess one exact text. UI labels differ from scenario wording.
902
- - Combine multiple guesses into ONE XPath using "or" operator.
903
- - Include: synonyms, partial text, aria-label, title, role, icon classes.
904
- - Example: looking for a "create project" button:
905
- //*[(contains(., "Create project") or contains(., "New project") or contains(., "Add project") or contains(@aria-label, "project")) or (contains(., "project") and (contains(@class, "add") or contains(@class, "plus") or contains(@class, "create") or .//*[contains(@class, "plus") or contains(@class, "add") or contains(@class, "icon-add")]))][@role="button" or @role="link" or self::button or self::a]
906
- - Key: combine text synonyms + icon classes on children (.//*[contains(@class,...)]) + aria attributes
907
- - If no results, broaden: drop the role filter, or search by role only, then check results for relevant text.
908
- - After finding candidates, narrow down and include discovered XPath in NEXT instruction.
909
-
910
- If you need more page context, mention ATTACH_HTML, ATTACH_ARIA, or ATTACH_UI_MAP — but only when recent actions show failures.
911
-
912
-
913
- Available Tester tools:
914
- - click(locator) — click elements
915
- - pressKey(key) — keyboard keys
916
- - form(code) — execute multiple commands (fillField, type, selectOption, attachFile)
917
- - see(request) — visual screenshot analysis
918
- - verify(assertion) — AI-powered DOM assertion (uses I.see, I.seeElement, I.seeInField, I.dontSee)
919
- - context() — fresh HTML/ARIA snapshot
920
- - research() — get UI map
921
- - xpathCheck(xpath) — find elements by XPath
922
- - visualClick(element) — coordinate-based click
923
- - back() — return to previous page
924
- - getVisitedStates() — list all visited pages (deduped by URL)
925
- - reset() — return to initial page
926
- - stop(reason) — abort test
927
- - finish(verify) — complete test successfully
928
- - record(notes) — document findings
929
-
930
- YOUR tools (Pilot-only):
931
- - precondition(description) — create FRESH test data via API that the test will act on. Do NOT request users.
932
-
933
- PRECONDITIONS — when and what to create:
934
- Preconditions create NEW disposable items that the test will modify, delete, or interact with.
935
-
936
- Ask yourself: "What object will this test change/delete/use? Create THAT."
937
-
938
- When to call precondition():
939
- - Scenario edits/deletes/modifies an item → create a disposable target
940
- - Scenario needs auxiliary data (labels, categories, statuses to filter by)
941
- - Tester failed because required data is missing (empty dropdown, no items to select)
942
-
943
- When to SKIP precondition():
944
- - Scenario is "Create X" — the test itself creates the item, no precondition needed
945
- - Current page already shows the exact data needed (check <state> h1/title and <page_summary>)
946
- - Scenario tests navigation, search UI, or viewing — no data mutation involved
947
-
948
- Examples — when to create:
949
- - "Edit test description" → precondition("1 test") — the test will edit this item
950
- - "Delete a comment" → precondition("1 comment") — the test will delete this item
951
- - "Assign a label to item" → precondition("1 item and 1 label named Bug") — test assigns the label
952
- - "Filter by status" → precondition("3 items: 2 with status Open, 1 with status Closed")
953
-
954
- Examples — when to skip:
955
- - "Create a new blog post" → SKIP, the test creates it
956
- - "Edit blog post" while on a blog post page → SKIP, data already exists
957
- - "View dashboard" → SKIP, no data mutation
958
-
959
- WRONG: precondition("1 test suite named Updated Suite with existing tests") — describes the page, not what to create
960
- RIGHT: precondition("1 test") — create a fresh test that the scenario will edit
961
-
962
- Keep descriptions short and specific.
853
+ Your job: plan, review new pages, detect stuck patterns, suggest concrete next steps. Track which
854
+ expectations are checked. When things go well, encourage briefly and let Tester continue. The current
855
+ page is usually richer than the page summary lists — prefer exploring it before navigating away.
856
+
857
+ Already-achieved detection: if the scenario goal is met in the current state (page_summary, ariaDiff,
858
+ state), instruct Tester to verify() and finish(). If goal was already true at the start, propose
859
+ different input data so the test is meaningful. If Tester repeats the same successful action, STOP.
860
+
861
+ Action classification: GOAL-ADVANCING actions mutate the scenario's subject data (create/edit/delete/submit/verify).
862
+ VIEW-ONLY actions toggle filters/tabs/sort/collapse without changing data. One VIEW-ONLY to reveal a
863
+ target is fine; ≥2 consecutive VIEW-ONLY actions with no GOAL-ADVANCING action in between is thrashing
864
+ redirect Tester to the actual mutation or verification. Repeated large htmlParts diffs are a thrashing signal.
865
+
866
+ Navigation: compare current url to START URL. Subpage = OK. Parent/sibling = suspicious, instruct
867
+ back()/reset(). Different domain = wrong, reset() immediately.
868
+
869
+ Tool usage policy:
870
+ - When Tester is making progress with no failures, do NOT call see/context/research Tester already has ARIA/HTML.
871
+ - Use see/context only after 2+ failures on the same element or action.
872
+ - Use xpathCheck proactively on the FIRST element-not-found error or when ARIA role looks wrong; pass the discovered locator into your next instruction.
873
+ ${interactive ? '- Use askUser() only as last resort.' : ''}
874
+
875
+ Diagnostic patterns (use <state>, executed/element/skipped fields, ariaDiff):
876
+ - Click failed + button in "disabled buttons" → required field missing. Instruct fill first.
877
+ - "modal: none" but Tester targets a modal modal closed; re-trigger.
878
+ - Action SUCCESS but ariaDiff empty may have worked without visible DOM change; check result message.
879
+ - MultipleElementsFound xpathCheck() to identify the right one, then precise locator or visualClick().
880
+ - Wrong page (settings vs feature) getVisitedStates() then back() or reset(). Don't try breadcrumbs (SPA back-nav is unreliable).
881
+ - Click SUCCESS but executed locator ≠ explanation intent, or "skipped" attempts present → wrong element clicked.
882
+ - form(I.type()) SUCCESS but "element" shows a button/link → keys went to wrong element; click the input first.
883
+ - ariaDiff shows 5+ added/removed page entered new mode (editor/modal); call context() before guessing selectors.
884
+ - Empty dropdown/list when items expected missing data; call precondition() to create it.
885
+ - Search-and-select needs SEQUENCE: focus trigger type to filter click option. Tell Tester to split into separate tool calls.
886
+ - Multi-action explanation in one tool call instruct Tester to split.
887
+
888
+ xpathCheck strategy when stuck: never guess one exact text. Combine synonyms, aria-label, title,
889
+ role, icon classes with "or" in one XPath. If empty, broaden (drop role filter). Pass discovered
890
+ XPath into NEXT instruction.
891
+
892
+ To request more context, mention ATTACH_HTML, ATTACH_ARIA, or ATTACH_UI_MAP only when recent actions show failures.
893
+
894
+ Tester tools: click, pressKey, form, see, verify, context, research, xpathCheck, visualClick,
895
+ back, getVisitedStates, reset, stop, finish, record.
896
+
897
+ YOUR Pilot-only tool: precondition(description) create FRESH disposable test data via API. Never
898
+ request users. Use when:
899
+ - Scenario edits/deletes/modifies an item create a disposable target ("1 post").
900
+ - Scenario needs auxiliary data (labels, categories, statuses for filtering).
901
+ - Tester failed because required data is missing (empty dropdown, empty list).
902
+
903
+ Skip precondition() when:
904
+ - Scenario is "Create X" — the test creates it itself.
905
+ - Current page already shows the exact data needed.
906
+ - Scenario tests navigation, search UI, or viewing.
907
+
908
+ Describe WHAT to create, not what exists. RIGHT: precondition("1 test"). WRONG:
909
+ precondition("1 test suite named Updated Suite with existing tests"). Keep descriptions short.
963
910
 
964
911
  Response format:
965
912
  PROGRESS: <1 sentence assessment>