explorbot 0.1.12 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/explorbot-cli.ts +21 -21
- package/dist/bin/explorbot-cli.js +3 -3
- package/dist/package.json +4 -2
- package/dist/rules/researcher/container-rules.md +2 -0
- package/dist/src/action-result.js +2 -1
- package/dist/src/action.js +3 -8
- package/dist/src/ai/captain.js +0 -2
- package/dist/src/ai/conversation.js +20 -4
- package/dist/src/ai/driller.js +1108 -0
- package/dist/src/ai/historian/utils.js +8 -1
- package/dist/src/ai/pilot.js +214 -267
- package/dist/src/ai/provider.js +25 -12
- package/dist/src/ai/quartermaster.js +2 -2
- package/dist/src/ai/rules.js +5 -5
- package/dist/src/ai/session-analyst.js +122 -0
- package/dist/src/ai/tester.js +69 -22
- package/dist/src/ai/tools.js +19 -4
- package/dist/src/commands/base-command.js +6 -6
- package/dist/src/commands/drill-command.js +3 -2
- package/dist/src/commands/exit-command.js +1 -0
- package/dist/src/commands/explore-command.js +9 -2
- package/dist/src/components/AddRule.js +1 -1
- package/dist/src/components/StatusPane.js +6 -1
- package/dist/src/experience-tracker.js +9 -0
- package/dist/src/explorbot.js +48 -8
- package/dist/src/explorer.js +11 -13
- package/dist/src/reporter.js +105 -4
- package/dist/src/state-manager.js +4 -3
- package/dist/src/stats.js +7 -1
- package/dist/src/test-plan.js +47 -3
- package/dist/src/utils/aria.js +354 -529
- package/dist/src/utils/hooks-runner.js +2 -8
- package/dist/src/utils/html.js +371 -0
- package/dist/src/utils/unique-names.js +12 -1
- package/dist/src/utils/url-matcher.js +6 -1
- package/dist/src/utils/web-element.js +27 -24
- package/dist/src/utils/xpath.js +1 -1
- package/package.json +4 -2
- package/rules/researcher/container-rules.md +2 -0
- package/src/action-result.ts +2 -1
- package/src/action.ts +3 -10
- package/src/ai/captain.ts +0 -2
- package/src/ai/conversation.ts +21 -4
- package/src/ai/driller.ts +1194 -0
- package/src/ai/historian/utils.ts +8 -1
- package/src/ai/pilot.ts +215 -265
- package/src/ai/provider.ts +24 -12
- package/src/ai/quartermaster.ts +2 -2
- package/src/ai/rules.ts +5 -5
- package/src/ai/session-analyst.ts +139 -0
- package/src/ai/tester.ts +63 -20
- package/src/ai/tools.ts +18 -4
- package/src/commands/base-command.ts +6 -6
- package/src/commands/drill-command.ts +3 -2
- package/src/commands/exit-command.ts +1 -0
- package/src/commands/explore-command.ts +10 -2
- package/src/components/AddRule.tsx +1 -1
- package/src/components/StatusPane.tsx +6 -3
- package/src/config.ts +4 -0
- package/src/experience-tracker.ts +9 -0
- package/src/explorbot.ts +55 -10
- package/src/explorer.ts +10 -12
- package/src/reporter.ts +108 -4
- package/src/state-manager.ts +4 -3
- package/src/stats.ts +10 -1
- package/src/test-plan.ts +62 -3
- package/src/utils/aria.ts +367 -537
- package/src/utils/hooks-runner.ts +2 -6
- package/src/utils/html.ts +381 -0
- package/src/utils/unique-names.ts +13 -0
- package/src/utils/url-matcher.ts +5 -1
- package/src/utils/web-element.ts +31 -28
- package/src/utils/xpath.ts +1 -1
- package/dist/src/ai/bosun.js +0 -456
- package/src/ai/bosun.ts +0 -571
package/src/ai/pilot.ts
CHANGED
|
@@ -89,15 +89,16 @@ export class Pilot implements Agent {
|
|
|
89
89
|
const notes = task.notesToString() || 'No notes recorded.';
|
|
90
90
|
|
|
91
91
|
let visualAnalysis = '';
|
|
92
|
+
let screenshotState: ActionResult | null = null;
|
|
92
93
|
if (this.provider.hasVision()) {
|
|
93
94
|
try {
|
|
94
95
|
const action = this.explorer.createAction();
|
|
95
|
-
|
|
96
|
+
screenshotState = await action.caputrePageWithScreenshot();
|
|
96
97
|
if (screenshotState.screenshot) {
|
|
97
98
|
visualAnalysis = (await this.researcher.answerQuestionAboutScreenshot(screenshotState, `Describe current page state relevant to: ${task.scenario}`)) || '';
|
|
98
99
|
}
|
|
99
100
|
} catch {
|
|
100
|
-
|
|
101
|
+
screenshotState = null;
|
|
101
102
|
}
|
|
102
103
|
}
|
|
103
104
|
|
|
@@ -109,7 +110,7 @@ export class Pilot implements Agent {
|
|
|
109
110
|
.string()
|
|
110
111
|
.nullable()
|
|
111
112
|
.describe(
|
|
112
|
-
'REQUIRED whenever decision is "pass" —
|
|
113
|
+
'REQUIRED whenever decision is "pass" — a one-sentence natural-language claim about the current page that, if true, proves the scenario goal (e.g., "New test suite \\"Foo\\" is visible in the suites list"). NOT code: do not write I.*, expect(), .then(), grabTitle, or any JavaScript. Navigator translates the claim into CodeceptJS assertions and runs them; passing assertions are saved to the generated test file. Also use when evidence is insufficient before deciding pass/fail. Leave null for "continue", "fail", or "skipped".'
|
|
113
114
|
),
|
|
114
115
|
});
|
|
115
116
|
|
|
@@ -132,19 +133,20 @@ export class Pilot implements Agent {
|
|
|
132
133
|
${sessionLog || 'No actions recorded'}
|
|
133
134
|
</session_log>
|
|
134
135
|
|
|
135
|
-
Decide
|
|
136
|
-
|
|
137
|
-
- "
|
|
138
|
-
- "
|
|
139
|
-
- "
|
|
140
|
-
-
|
|
141
|
-
-
|
|
142
|
-
|
|
143
|
-
When deciding "pass", you MUST also set requestVerification to a
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
136
|
+
Decide and commit. "continue" extends the loop and burns iterations — choose it only when
|
|
137
|
+
evidence is genuinely insufficient to call pass/fail, not as a safety hedge.
|
|
138
|
+
- "pass" if final state proves the SCENARIO GOAL is accomplished. Set requestVerification.
|
|
139
|
+
- "fail" if scenario was attempted but goal not achieved.
|
|
140
|
+
- "skipped" if scenario is irrelevant/inapplicable, OR systematic infrastructure failures.
|
|
141
|
+
- "continue" only when a concrete missing piece of evidence (a verify/see) would change your verdict.
|
|
142
|
+
- Mixed evidence + final state shows success → pass. Mixed + final state unclear → continue with guidance.
|
|
143
|
+
|
|
144
|
+
When deciding "pass", you MUST also set requestVerification to a one-sentence natural-language
|
|
145
|
+
claim about the current page (e.g., "New test suite Foo is visible in the suites list"). NOT
|
|
146
|
+
code — do not write I.*, expect(), .then(), or any JavaScript. Choose the strongest single
|
|
147
|
+
piece of evidence (a unique element/text that exists ONLY because the scenario succeeded).
|
|
148
|
+
Navigator translates the claim into CodeceptJS assertions; without it the generated test has
|
|
149
|
+
nothing to assert and is worthless.
|
|
148
150
|
`;
|
|
149
151
|
|
|
150
152
|
const messages = [
|
|
@@ -167,46 +169,33 @@ export class Pilot implements Agent {
|
|
|
167
169
|
return false;
|
|
168
170
|
}
|
|
169
171
|
|
|
170
|
-
if (result.requestVerification && navigator) {
|
|
172
|
+
if (result.decision === 'pass' && result.requestVerification && navigator) {
|
|
171
173
|
tag('substep').log(`Pilot requesting verification: ${result.requestVerification}`);
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
if (verifyResult.assertionSteps?.length) {
|
|
176
|
-
this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
|
|
177
|
-
}
|
|
178
|
-
tag('substep').log(`Pilot verified: ${result.requestVerification}`);
|
|
179
|
-
} else {
|
|
180
|
-
tag('substep').log(`Pilot verification failed: ${result.requestVerification}`);
|
|
181
|
-
if (result.decision === 'pass') {
|
|
182
|
-
const flipMessage = `Verification "${result.requestVerification}" did not match the page. Adjust approach and re-verify before finishing.`;
|
|
183
|
-
result.decision = 'continue';
|
|
184
|
-
result.reason = flipMessage;
|
|
185
|
-
result.guidance = result.guidance ?? flipMessage;
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
} catch (verifyErr: any) {
|
|
189
|
-
tag('warning').log(`Pilot verification errored: ${verifyErr.message}`);
|
|
174
|
+
const verifyResult = await navigator.verifyState(result.requestVerification, currentState).catch(() => null);
|
|
175
|
+
if (verifyResult?.verified && verifyResult.assertionSteps?.length) {
|
|
176
|
+
this.explorer.getPlaywrightRecorder().recordVerification(verifyResult.assertionSteps);
|
|
190
177
|
}
|
|
191
178
|
}
|
|
192
179
|
|
|
193
180
|
tag('info').log(`Pilot: ${result.decision} — ${result.reason}`);
|
|
194
181
|
task.summary = result.reason;
|
|
195
182
|
|
|
183
|
+
const verdictState = screenshotState || currentState;
|
|
184
|
+
|
|
196
185
|
if (result.decision === 'pass') {
|
|
197
|
-
task.
|
|
186
|
+
task.setVerification(`Pilot: ${result.reason}`, TestResult.PASSED, verdictState);
|
|
198
187
|
task.finish(TestResult.PASSED);
|
|
199
188
|
return false;
|
|
200
189
|
}
|
|
201
190
|
|
|
202
191
|
if (result.decision === 'fail') {
|
|
203
|
-
task.
|
|
192
|
+
task.setVerification(`Pilot: ${result.reason}`, TestResult.FAILED, verdictState);
|
|
204
193
|
task.finish(TestResult.FAILED);
|
|
205
194
|
return false;
|
|
206
195
|
}
|
|
207
196
|
|
|
208
197
|
if (result.decision === 'skipped') {
|
|
209
|
-
task.
|
|
198
|
+
task.setVerification(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED, verdictState);
|
|
210
199
|
task.finish(TestResult.SKIPPED);
|
|
211
200
|
return false;
|
|
212
201
|
}
|
|
@@ -310,108 +299,91 @@ export class Pilot implements Agent {
|
|
|
310
299
|
}
|
|
311
300
|
}
|
|
312
301
|
|
|
313
|
-
private
|
|
302
|
+
private buildSharedEvidenceRules(task: Test): string {
|
|
314
303
|
return dedent`
|
|
315
|
-
You are Pilot — the supervisor that decides whether a reset is legitimate.
|
|
316
|
-
Tester wants to reset (navigate back to the start URL and discard progress).
|
|
317
|
-
|
|
318
304
|
SCENARIO: ${task.scenario}
|
|
319
305
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
-
|
|
334
|
-
|
|
335
|
-
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
If present, almost never allow the reset — the work is done.
|
|
349
|
-
2) resetCount. Each prior reset raises the bar for allowing another.
|
|
350
|
-
3) Tester's stated reason. Weigh it against the observed evidence, do not trust it blindly.
|
|
306
|
+
EVIDENCE PRIORITY (strict):
|
|
307
|
+
1) Final observable state proving the scenario goal
|
|
308
|
+
2) verify()/see() results in the LAST few actions before stop/finish
|
|
309
|
+
3) Intermediate action outcomes (diagnostic, not decisive)
|
|
310
|
+
Mixed evidence with a clear final-state success → pass. Mixed with unclear final state → continue.
|
|
311
|
+
|
|
312
|
+
EVIDENCE SOURCES disagree often: verify(), see(), visual_analysis, session_log. No single source
|
|
313
|
+
overrides the others — weigh them together. Tester's record() notes are the LEAST reliable; always
|
|
314
|
+
cross-check against actual actions and state. Visual screenshot analysis is strong for UI state
|
|
315
|
+
(active tabs, visible counts, colors).
|
|
316
|
+
|
|
317
|
+
SCENARIO TITLE defines what must happen. Action verbs require persisted evidence:
|
|
318
|
+
- "Create X" → X must exist (visible, redirected to its page, or success message). Opening a form is NOT enough.
|
|
319
|
+
- "Delete X" → X must be gone. Clicking delete is NOT enough.
|
|
320
|
+
- "Edit X" → updated value must be persisted (visible in list/detail). Opening edit is NOT enough; redirect after save with the new value visible IS enough.
|
|
321
|
+
- Negative tests ("without a name", "invalid", "duplicate", "unauthorized") → success means the system PREVENTED the action with validation/error.
|
|
322
|
+
|
|
323
|
+
PROVENANCE for create/edit scenarios: the task prompt instructs the tester to inject the
|
|
324
|
+
session marker "${task.sessionName ?? ''}" into newly created or edited free-text values.
|
|
325
|
+
When that marker COULD be injected, the entity used as proof MUST contain it. A record
|
|
326
|
+
matching the goal by text alone but missing the marker is a stale leftover from a prior
|
|
327
|
+
run — it is NOT evidence the current scenario produced anything. Vote \`fail\`, not \`pass\`.
|
|
328
|
+
This does not apply when the field is restricted (numeric only, enum, etc.) or when the
|
|
329
|
+
session_log shows no fillField/type/select actions were attempted at all (in that case
|
|
330
|
+
the scenario clearly didn't run — also vote \`fail\`).
|
|
331
|
+
|
|
332
|
+
Expected results are MILESTONES, not the goal. Never fail because a milestone (toast, icon, styling)
|
|
333
|
+
didn't match if the scenario goal IS accomplished.
|
|
351
334
|
|
|
352
|
-
|
|
353
|
-
Give a specific next action on the current page: which tool to call, what to verify, or how to
|
|
354
|
-
record the outcome. Do not suggest repeating actions that already succeeded.
|
|
335
|
+
${this.buildDeletionScope(task)}
|
|
355
336
|
|
|
356
|
-
EXPECTED RESULTS (milestones
|
|
337
|
+
EXPECTED RESULTS (milestones):
|
|
357
338
|
${task.expected.map((e) => `- ${e}`).join('\n')}
|
|
358
339
|
`;
|
|
359
340
|
}
|
|
360
341
|
|
|
361
|
-
private
|
|
342
|
+
private buildResetSystemPrompt(task: Test): string {
|
|
362
343
|
return dedent`
|
|
363
|
-
You are Pilot —
|
|
364
|
-
|
|
344
|
+
You are Pilot — decide whether a reset is legitimate. Reset is DESTRUCTIVE: it abandons this
|
|
345
|
+
iteration's work, but server-side side effects (records created, forms submitted) persist.
|
|
346
|
+
Unnecessary resets create duplicate data and infinite loops.
|
|
365
347
|
|
|
366
|
-
|
|
348
|
+
${this.buildSharedEvidenceRules(task)}
|
|
367
349
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
2
|
|
372
|
-
|
|
373
|
-
If final state evidence proves the scenario goal, PASS even when some intermediate actions failed.
|
|
374
|
-
Do not fail only because a specific click failed, no toast appeared, or navigation was different than expected.
|
|
375
|
-
Intermediate failures are diagnostic, not decisive, when end state confirms success.
|
|
376
|
-
Expected results are helpful milestones but they DO NOT override the scenario goal.
|
|
377
|
-
NEVER fail a test because an expected result (milestone) was not met when the scenario goal itself IS accomplished.
|
|
378
|
-
The SCENARIO TITLE defines what must happen. If the title says "Create X and verify it appears" and X was created and appears — that's a PASS, even if some milestone about icons/status/styling was not met.
|
|
379
|
-
If the scenario says "Create X", then X must be created — opening a form or navigating to /new URL is NOT enough. There must be evidence that the item now exists: visible on page, redirected to the item's page, or a success/confirmation message appeared.
|
|
380
|
-
If the scenario says "Delete X", then X must be deleted — clicking delete button is not enough. There must be evidence the item is gone.
|
|
381
|
-
If the scenario says "Edit X", then changes must be saved — opening an edit form is NOT enough.
|
|
382
|
-
For edit/update/rename scenarios, persisted updated value visible in list/detail view is valid save evidence, even without toast and even if page redirected away from edit view.
|
|
383
|
-
DO NOT trust Tester's self-assessment in notes (like "scenario goal achieved"). Verify against actual actions and state.
|
|
384
|
-
EVIDENCE SOURCES: verify(), see(), visual_analysis, and action results in session_log are all evidence. They may disagree — analyze all of them together to reach your decision. No single source automatically overrides the others. Visual analysis from screenshots is strong evidence for UI state (active tabs, visible items, counts, colors). Tester's self-assessment in record() notes is the least reliable — always cross-check against actual evidence.
|
|
385
|
-
SESSION LOG shows ALL actions grouped by URL. If the scenario requires changing data (edit/create/delete) but all form/click actions FAILED, the test cannot pass — even if a verify() found matching content that existed before the test.
|
|
386
|
-
|
|
387
|
-
VERIFICATION RULE: Only the LAST few actions before finish/stop count as verification evidence.
|
|
388
|
-
- If verify() or see() is among the last actions → use its result as evidence.
|
|
389
|
-
- If no verification was done → prefer "continue" with guidance telling tester what to verify.
|
|
390
|
-
- If verify assertion describes a state that was ALREADY TRUE before the test started, the verification proves nothing — reject with "continue".
|
|
391
|
-
|
|
392
|
-
GUIDANCE FIELD: When decision is "continue", you MUST provide "guidance" — a specific actionable instruction:
|
|
393
|
-
- If evidence is insufficient: tell tester to verify with see()/verify(), specify WHAT to check
|
|
394
|
-
- If approach was wrong: tell tester to try a different method, suggest which one
|
|
395
|
-
- If remaining steps exist: tell tester which steps to complete next
|
|
396
|
-
Be concrete. Example: "Use see() to check if the description text appears in the Description tab panel" not "verify the result".
|
|
397
|
-
Do NOT tell tester to redo the same actions that already succeeded.
|
|
398
|
-
|
|
399
|
-
NEGATIVE TESTS: Some scenarios test that something CANNOT or SHOULD NOT happen.
|
|
400
|
-
Patterns: "without a name", "with invalid data", "empty field", "wrong password", "unauthorized", "duplicate".
|
|
401
|
-
For negative tests, success means the system PREVENTED the action — error messages, validation, disabled buttons.
|
|
402
|
-
Example: "Create X without a name" PASSES if X was NOT created and validation appeared.
|
|
403
|
-
|
|
404
|
-
SKIPPED TESTS: Choose "skipped" in two cases:
|
|
405
|
-
1) Scenario is irrelevant: feature doesn't exist on the page, required UI elements are completely absent, scenario prerequisites cannot be met.
|
|
406
|
-
2) Systematic execution failures: repeated LLM/API errors, navigation crashes, tool failures unrelated to the scenario itself. These are infrastructure problems, not test failures.
|
|
407
|
-
Do NOT use "skipped" when the feature exists but the test just failed to interact with it — that's "fail" or "continue".
|
|
350
|
+
DECISION:
|
|
351
|
+
- "allow": current page cannot host the scenario, irrecoverable error, or no path back.
|
|
352
|
+
- "continue": prior action already succeeded (URL changed, record visible, confirmation shown) — verify/finish instead. Or scenario goal may already be met; instruct tester to verify the actual outcome rather than redo. Provide guidance.
|
|
353
|
+
- "fail": resetCount >= 2 and underlying situation hasn't changed; same flow tried twice with same failure mode.
|
|
354
|
+
- "skipped": feature doesn't exist on this app or prerequisites can't be met.
|
|
408
355
|
|
|
409
|
-
|
|
356
|
+
PRIORITY:
|
|
357
|
+
1) Successful side effects in session_log → almost never allow reset.
|
|
358
|
+
2) resetCount — each prior reset raises the bar.
|
|
359
|
+
3) Tester's stated reason — weigh against evidence, don't trust blindly.
|
|
410
360
|
|
|
411
|
-
|
|
361
|
+
GUIDANCE (required for "continue"): a specific next action on the current page — which tool, what
|
|
362
|
+
to verify, how to record. Do not suggest repeating actions that already succeeded.
|
|
363
|
+
`;
|
|
364
|
+
}
|
|
412
365
|
|
|
413
|
-
|
|
414
|
-
|
|
366
|
+
private buildVerdictSystemPrompt(type: string, task: Test): string {
|
|
367
|
+
return dedent`
|
|
368
|
+
You are Pilot — final decision maker for test pass/fail. Tester requested ${type}. Review the
|
|
369
|
+
evidence and commit to a verdict; "continue" only when evidence is genuinely insufficient.
|
|
370
|
+
|
|
371
|
+
${this.buildSharedEvidenceRules(task)}
|
|
372
|
+
|
|
373
|
+
DECISION:
|
|
374
|
+
- "pass": scenario goal is fully accomplished. Set requestVerification to a one-sentence claim about
|
|
375
|
+
the current page that proves it (a unique element/text that exists ONLY because the scenario succeeded).
|
|
376
|
+
Pick assertions DOM can express; for non-DOM regions (iframes, canvas, Monaco/CodeMirror), target a
|
|
377
|
+
stable landmark (container, ARIA role) instead of literal inner text. Your "pass" stands even if the
|
|
378
|
+
DOM assertion can't be made.
|
|
379
|
+
- "fail": scenario was attempted but the goal was not achieved.
|
|
380
|
+
- "skipped": scenario is irrelevant to the app, OR systematic infrastructure failures (LLM errors,
|
|
381
|
+
crashes) prevented testing. NOT for "test failed to interact" — that's "fail" or "continue".
|
|
382
|
+
- "continue": tester hasn't completed the goal; provide concrete guidance (which tool, what to check).
|
|
383
|
+
If a verify() asserted a state that was ALREADY TRUE before the test, it proves nothing — reject.
|
|
384
|
+
|
|
385
|
+
reason field: do NOT restate the decision ("scenario goal achieved/not achieved"). State what happened —
|
|
386
|
+
what was verified, what failed, what evidence was found.
|
|
415
387
|
`;
|
|
416
388
|
}
|
|
417
389
|
|
|
@@ -423,7 +395,9 @@ export class Pilot implements Agent {
|
|
|
423
395
|
allowNewResearch: false,
|
|
424
396
|
});
|
|
425
397
|
const agenticModel = this.provider.getAgenticModel('pilot');
|
|
426
|
-
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState
|
|
398
|
+
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState), 'pilot', agenticModel);
|
|
399
|
+
this.conversation.markLastMessageCacheable();
|
|
400
|
+
this.conversation.protectPrefix(1);
|
|
427
401
|
|
|
428
402
|
const stateContext = this.buildStateContext(currentState);
|
|
429
403
|
|
|
@@ -470,7 +444,7 @@ export class Pilot implements Agent {
|
|
|
470
444
|
);
|
|
471
445
|
}
|
|
472
446
|
|
|
473
|
-
async reviewNewPage(task: Test, currentState: ActionResult): Promise<string> {
|
|
447
|
+
async reviewNewPage(task: Test, currentState: ActionResult, testerConversation: Conversation): Promise<string> {
|
|
474
448
|
if (!this.conversation) return '';
|
|
475
449
|
|
|
476
450
|
tag('substep').log('Pilot reviewing new page...');
|
|
@@ -481,8 +455,14 @@ export class Pilot implements Agent {
|
|
|
481
455
|
if (!pageSummary) return '';
|
|
482
456
|
|
|
483
457
|
const stateContext = this.buildStateContext(currentState);
|
|
458
|
+
const toolCalls = testerConversation
|
|
459
|
+
.getToolExecutions()
|
|
460
|
+
.filter((t: any) => t.wasSuccessful)
|
|
461
|
+
.slice(-this.stepsToReview);
|
|
462
|
+
const actionsContext = this.formatActions(toolCalls);
|
|
484
463
|
|
|
485
464
|
this.conversation.cleanupTag('page_summary', '...trimmed...', 1);
|
|
465
|
+
this.conversation.cleanupTag('recent_actions', '...trimmed...', 2);
|
|
486
466
|
|
|
487
467
|
return this.sendToPilot(
|
|
488
468
|
dedent`
|
|
@@ -497,6 +477,10 @@ export class Pilot implements Agent {
|
|
|
497
477
|
${pageSummary}
|
|
498
478
|
</page_summary>
|
|
499
479
|
|
|
480
|
+
<recent_actions>
|
|
481
|
+
${actionsContext || 'None'}
|
|
482
|
+
</recent_actions>
|
|
483
|
+
|
|
500
484
|
${this.formatExpectations(task)}
|
|
501
485
|
|
|
502
486
|
First: evaluate whether this navigation makes sense for the scenario goal. If the page is unrelated, instruct Tester to back() or reset(). Then plan next steps.
|
|
@@ -509,11 +493,9 @@ export class Pilot implements Agent {
|
|
|
509
493
|
tag('substep').log('Pilot analyzing progress...');
|
|
510
494
|
|
|
511
495
|
if (!this.conversation) {
|
|
512
|
-
const pageSummary = await this.researcher.summary(currentState, {
|
|
513
|
-
allowNewResearch: false,
|
|
514
|
-
});
|
|
515
496
|
const agenticModel = this.provider.getAgenticModel('pilot');
|
|
516
|
-
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState
|
|
497
|
+
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState), 'pilot', agenticModel);
|
|
498
|
+
this.conversation.markLastMessageCacheable();
|
|
517
499
|
}
|
|
518
500
|
|
|
519
501
|
const toolCalls = testerConversation.getToolExecutions().slice(-this.stepsToReview);
|
|
@@ -582,6 +564,7 @@ export class Pilot implements Agent {
|
|
|
582
564
|
const result = await this.provider.invokeConversation(this.conversation!, tools, {
|
|
583
565
|
maxToolRoundtrips: opts.maxToolRoundtrips ?? 0,
|
|
584
566
|
agentName: 'pilot',
|
|
567
|
+
stopWhen: opts.task ? () => opts.task!.hasFinished : undefined,
|
|
585
568
|
experimental_telemetry: { functionId },
|
|
586
569
|
});
|
|
587
570
|
const text = result?.response?.text || '';
|
|
@@ -634,6 +617,8 @@ export class Pilot implements Agent {
|
|
|
634
617
|
debugLog(`precondition: ${description}, fisherman: ${this.fisherman?.isAvailable() ? 'available' : 'none'}`);
|
|
635
618
|
|
|
636
619
|
if (!this.fisherman || !this.fisherman.isAvailable()) {
|
|
620
|
+
const skipReason = await this.checkDataAvailability(task, description, 'Fisherman not available');
|
|
621
|
+
if (skipReason) return { noted: true, prepared: false, skipped: true, reason: skipReason };
|
|
637
622
|
return { noted: true, prepared: false, reason: 'Fisherman not available' };
|
|
638
623
|
}
|
|
639
624
|
|
|
@@ -641,6 +626,8 @@ export class Pilot implements Agent {
|
|
|
641
626
|
|
|
642
627
|
if (!result.success || result.created.length === 0) {
|
|
643
628
|
if (result.summary) tag('warning').log(`Precondition failed: ${result.summary}`);
|
|
629
|
+
const skipReason = await this.checkDataAvailability(task, description, result.summary);
|
|
630
|
+
if (skipReason) return { noted: true, prepared: false, skipped: true, reason: skipReason };
|
|
644
631
|
return { noted: true, prepared: false, reason: result.summary };
|
|
645
632
|
}
|
|
646
633
|
|
|
@@ -660,6 +647,38 @@ export class Pilot implements Agent {
|
|
|
660
647
|
};
|
|
661
648
|
}
|
|
662
649
|
|
|
650
|
+
private async checkDataAvailability(task: Test, requestedData: string, fishermanReason: string | undefined): Promise<string | null> {
|
|
651
|
+
if (!this.provider.hasVision()) return null;
|
|
652
|
+
|
|
653
|
+
const action = this.explorer.createAction();
|
|
654
|
+
const screenshotState = await action.caputrePageWithScreenshot().catch(() => null);
|
|
655
|
+
if (!screenshotState?.screenshot) return null;
|
|
656
|
+
|
|
657
|
+
const question = dedent`
|
|
658
|
+
Test scenario: "${task.scenario}"
|
|
659
|
+
Data we tried to create automatically (and failed): ${requestedData}
|
|
660
|
+
Failure reason: ${fishermanReason || 'unknown'}
|
|
661
|
+
|
|
662
|
+
Looking at the current page only, can this scenario still be carried out?
|
|
663
|
+
- YES if the page already shows the items the scenario will act on, OR if the page exposes a UI control that creates such items (an "Add", "New", "+" button, an empty-state CTA, etc.).
|
|
664
|
+
- NO if the scenario needs items that aren't visible AND there is no way to create them from this page (e.g. a filter/search/select scenario over an empty list with no creation affordance).
|
|
665
|
+
|
|
666
|
+
Reply with YES or NO on the first line, then a one-sentence reason on the second line.
|
|
667
|
+
`;
|
|
668
|
+
|
|
669
|
+
const answer = await this.researcher.answerQuestionAboutScreenshot(screenshotState, question);
|
|
670
|
+
if (!answer) return null;
|
|
671
|
+
|
|
672
|
+
const firstLine = answer.split('\n')[0]?.trim().toUpperCase() ?? '';
|
|
673
|
+
if (!firstLine.startsWith('NO')) return null;
|
|
674
|
+
|
|
675
|
+
const reason = answer.split('\n').slice(1).join(' ').trim() || 'Required data is absent and cannot be created from this page';
|
|
676
|
+
task.setVerification(`Pilot: skipped — ${reason}`, TestResult.SKIPPED, screenshotState);
|
|
677
|
+
task.finish(TestResult.SKIPPED);
|
|
678
|
+
tag('info').log(`Pilot: precondition failed and page lacks required data — skipping test (${reason})`);
|
|
679
|
+
return reason;
|
|
680
|
+
}
|
|
681
|
+
|
|
663
682
|
private buildStateContext(state: ActionResult): string {
|
|
664
683
|
const lines: string[] = [];
|
|
665
684
|
|
|
@@ -829,7 +848,7 @@ export class Pilot implements Agent {
|
|
|
829
848
|
}
|
|
830
849
|
|
|
831
850
|
const analysisText = exec.output?.analysis;
|
|
832
|
-
const resultMessage = analysisText ? (analysisText.length >
|
|
851
|
+
const resultMessage = analysisText ? (analysisText.length > 300 ? `${analysisText.slice(0, 300)}...` : analysisText) : exec.output?.message || exec.output?.result;
|
|
833
852
|
if (resultMessage && (CHECK_TOOLS.includes(exec.toolName) || !exec.wasSuccessful)) {
|
|
834
853
|
line += `\n result: ${resultMessage}`;
|
|
835
854
|
}
|
|
@@ -837,6 +856,7 @@ export class Pilot implements Agent {
|
|
|
837
856
|
groups.get(currentUrl)!.lines.push(line);
|
|
838
857
|
}
|
|
839
858
|
|
|
859
|
+
const PER_GROUP_CAP = 25;
|
|
840
860
|
const parts: string[] = [];
|
|
841
861
|
for (const [url, group] of groups) {
|
|
842
862
|
const header = [url];
|
|
@@ -844,7 +864,10 @@ export class Pilot implements Agent {
|
|
|
844
864
|
if (group.h1) header.push(` h1: ${group.h1}`);
|
|
845
865
|
if (group.h3) header.push(` h3: ${group.h3}`);
|
|
846
866
|
header.push('');
|
|
847
|
-
const
|
|
867
|
+
const omitted = Math.max(0, group.lines.length - PER_GROUP_CAP);
|
|
868
|
+
const visibleLines = omitted > 0 ? group.lines.slice(-PER_GROUP_CAP) : group.lines;
|
|
869
|
+
const lines = visibleLines.map((l) => ` ${l}`);
|
|
870
|
+
if (omitted > 0) lines.unshift(` [...${omitted} earlier action(s) omitted...]`);
|
|
848
871
|
parts.push([...header, ...lines].join('\n'));
|
|
849
872
|
}
|
|
850
873
|
|
|
@@ -909,12 +932,12 @@ export class Pilot implements Agent {
|
|
|
909
932
|
return '';
|
|
910
933
|
}
|
|
911
934
|
|
|
912
|
-
private getSystemPrompt(task: Test, initialState: ActionResult
|
|
935
|
+
private getSystemPrompt(task: Test, initialState: ActionResult): string {
|
|
913
936
|
const interactive = isInteractive();
|
|
914
937
|
const stepsText = task.plannedSteps.length > 0 ? task.plannedSteps.map((s, i) => `${i + 1}. ${s}`).join('\n') : 'No planned steps';
|
|
915
938
|
|
|
916
939
|
return dedent`
|
|
917
|
-
You are Pilot
|
|
940
|
+
You are Pilot — a supervisor that detects problems and intervenes only when needed.
|
|
918
941
|
|
|
919
942
|
SCENARIO: ${task.scenario}
|
|
920
943
|
START URL: ${initialState.url}
|
|
@@ -926,136 +949,63 @@ export class Pilot implements Agent {
|
|
|
926
949
|
PLANNED STEPS:
|
|
927
950
|
${stepsText}
|
|
928
951
|
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
-
|
|
948
|
-
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
-
|
|
955
|
-
-
|
|
956
|
-
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
-
|
|
960
|
-
-
|
|
961
|
-
-
|
|
962
|
-
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
-
|
|
974
|
-
|
|
975
|
-
-
|
|
976
|
-
-
|
|
977
|
-
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
- If Tester's explanation mentions TWO distinct actions in ONE tool call → flag this. Each distinct action should be a separate tool call. Instruct Tester to split into individual steps.
|
|
987
|
-
|
|
988
|
-
Complex component patterns — when Tester fails to interact with dropdowns/selects:
|
|
989
|
-
- Search-and-select dropdowns require a SEQUENCE: click/focus the trigger input, type to filter, then click an option from the dropdown list. Instruct Tester to split this into separate tool calls.
|
|
990
|
-
- If Tester clicks a generic dropdown trigger and ariaDiff shows unrelated options → wrong dropdown was triggered. Instruct Tester to use a more specific selector with container context.
|
|
991
|
-
- If Tester types into an input but no dropdown appears → they may need to click the trigger element first. Suggest using context() to check the current DOM state.
|
|
992
|
-
|
|
993
|
-
Tester ignoring visible elements:
|
|
994
|
-
- If <state> shows "active form" fields but Tester is clicking elements not found in ARIA, or trying buttons that don't exist → Tester is ignoring interactive elements that are actually on the page. Instruct Tester to focus on the elements listed in "active form" — these are the real interactive controls on the current page. The UI map may be outdated.
|
|
995
|
-
|
|
996
|
-
When Tester IS stuck finding an element, use xpathCheck() with COMBINED XPaths:
|
|
997
|
-
- NEVER guess one exact text. UI labels differ from scenario wording.
|
|
998
|
-
- Combine multiple guesses into ONE XPath using "or" operator.
|
|
999
|
-
- Include: synonyms, partial text, aria-label, title, role, icon classes.
|
|
1000
|
-
- Example: looking for a "create project" button:
|
|
1001
|
-
//*[(contains(., "Create project") or contains(., "New project") or contains(., "Add project") or contains(@aria-label, "project")) or (contains(., "project") and (contains(@class, "add") or contains(@class, "plus") or contains(@class, "create") or .//*[contains(@class, "plus") or contains(@class, "add") or contains(@class, "icon-add")]))][@role="button" or @role="link" or self::button or self::a]
|
|
1002
|
-
- Key: combine text synonyms + icon classes on children (.//*[contains(@class,...)]) + aria attributes
|
|
1003
|
-
- If no results, broaden: drop the role filter, or search by role only, then check results for relevant text.
|
|
1004
|
-
- After finding candidates, narrow down and include discovered XPath in NEXT instruction.
|
|
1005
|
-
|
|
1006
|
-
If you need more page context, mention ATTACH_HTML, ATTACH_ARIA, or ATTACH_UI_MAP — but only when recent actions show failures.
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
Available Tester tools:
|
|
1010
|
-
- click(locator) — click elements
|
|
1011
|
-
- pressKey(key) — keyboard keys
|
|
1012
|
-
- form(code) — execute multiple commands (fillField, type, selectOption, attachFile)
|
|
1013
|
-
- see(request) — visual screenshot analysis
|
|
1014
|
-
- verify(assertion) — AI-powered DOM assertion (uses I.see, I.seeElement, I.seeInField, I.dontSee)
|
|
1015
|
-
- context() — fresh HTML/ARIA snapshot
|
|
1016
|
-
- research() — get UI map
|
|
1017
|
-
- xpathCheck(xpath) — find elements by XPath
|
|
1018
|
-
- visualClick(element) — coordinate-based click
|
|
1019
|
-
- back() — return to previous page
|
|
1020
|
-
- getVisitedStates() — list all visited pages (deduped by URL)
|
|
1021
|
-
- reset() — return to initial page
|
|
1022
|
-
- stop(reason) — abort test
|
|
1023
|
-
- finish(verify) — complete test successfully
|
|
1024
|
-
- record(notes) — document findings
|
|
1025
|
-
|
|
1026
|
-
YOUR tools (Pilot-only):
|
|
1027
|
-
- precondition(description) — create FRESH test data via API that the test will act on. Do NOT request users.
|
|
1028
|
-
|
|
1029
|
-
PRECONDITIONS — when and what to create:
|
|
1030
|
-
Preconditions create NEW disposable items that the test will modify, delete, or interact with.
|
|
1031
|
-
|
|
1032
|
-
Ask yourself: "What object will this test change/delete/use? Create THAT."
|
|
1033
|
-
|
|
1034
|
-
When to call precondition():
|
|
1035
|
-
- Scenario edits/deletes/modifies an item → create a disposable target
|
|
1036
|
-
- Scenario needs auxiliary data (labels, categories, statuses to filter by)
|
|
1037
|
-
- Tester failed because required data is missing (empty dropdown, no items to select)
|
|
1038
|
-
|
|
1039
|
-
When to SKIP precondition():
|
|
1040
|
-
- Scenario is "Create X" — the test itself creates the item, no precondition needed
|
|
1041
|
-
- Current page already shows the exact data needed (check <state> h1/title and <page_summary>)
|
|
1042
|
-
- Scenario tests navigation, search UI, or viewing — no data mutation involved
|
|
1043
|
-
|
|
1044
|
-
Examples — when to create:
|
|
1045
|
-
- "Edit test description" → precondition("1 test") — the test will edit this item
|
|
1046
|
-
- "Delete a comment" → precondition("1 comment") — the test will delete this item
|
|
1047
|
-
- "Assign a label to item" → precondition("1 item and 1 label named Bug") — test assigns the label
|
|
1048
|
-
- "Filter by status" → precondition("3 items: 2 with status Open, 1 with status Closed")
|
|
1049
|
-
|
|
1050
|
-
Examples — when to skip:
|
|
1051
|
-
- "Create a new blog post" → SKIP, the test creates it
|
|
1052
|
-
- "Edit blog post" while on a blog post page → SKIP, data already exists
|
|
1053
|
-
- "View dashboard" → SKIP, no data mutation
|
|
1054
|
-
|
|
1055
|
-
WRONG: precondition("1 test suite named Updated Suite with existing tests") — describes the page, not what to create
|
|
1056
|
-
RIGHT: precondition("1 test") — create a fresh test that the scenario will edit
|
|
1057
|
-
|
|
1058
|
-
Keep descriptions short and specific.
|
|
952
|
+
Your job: plan, review new pages, detect stuck patterns, suggest concrete next steps. Track which
|
|
953
|
+
expectations are checked. When things go well, encourage briefly and let Tester continue. The current
|
|
954
|
+
page is usually richer than the page summary lists — prefer exploring it before navigating away.
|
|
955
|
+
|
|
956
|
+
Already-achieved detection: if the scenario goal is met in the current state (page_summary, ariaDiff,
|
|
957
|
+
state), instruct Tester to verify() and finish(). If goal was already true at the start, propose
|
|
958
|
+
different input data so the test is meaningful. If Tester repeats the same successful action, STOP.
|
|
959
|
+
|
|
960
|
+
Action classification: GOAL-ADVANCING actions mutate the scenario's subject data (create/edit/delete/submit/verify).
|
|
961
|
+
VIEW-ONLY actions toggle filters/tabs/sort/collapse without changing data. One VIEW-ONLY to reveal a
|
|
962
|
+
target is fine; ≥2 consecutive VIEW-ONLY actions with no GOAL-ADVANCING action in between is thrashing
|
|
963
|
+
— redirect Tester to the actual mutation or verification. Repeated large htmlParts diffs are a thrashing signal.
|
|
964
|
+
|
|
965
|
+
Navigation: compare current url to START URL. Subpage = OK. Parent/sibling = suspicious, instruct
|
|
966
|
+
back()/reset(). Different domain = wrong, reset() immediately.
|
|
967
|
+
|
|
968
|
+
Tool usage policy:
|
|
969
|
+
- When Tester is making progress with no failures, do NOT call see/context/research — Tester already has ARIA/HTML.
|
|
970
|
+
- Use see/context only after 2+ failures on the same element or action.
|
|
971
|
+
- Use xpathCheck proactively on the FIRST element-not-found error or when ARIA role looks wrong; pass the discovered locator into your next instruction.
|
|
972
|
+
${interactive ? '- Use askUser() only as last resort.' : ''}
|
|
973
|
+
|
|
974
|
+
Diagnostic patterns (use <state>, executed/element/skipped fields, ariaDiff):
|
|
975
|
+
- Click failed + button in "disabled buttons" → required field missing. Instruct fill first.
|
|
976
|
+
- "modal: none" but Tester targets a modal → modal closed; re-trigger.
|
|
977
|
+
- Action SUCCESS but ariaDiff empty → may have worked without visible DOM change; check result message.
|
|
978
|
+
- MultipleElementsFound → xpathCheck() to identify the right one, then precise locator or visualClick().
|
|
979
|
+
- Wrong page (settings vs feature) → getVisitedStates() then back() or reset(). Don't try breadcrumbs (SPA back-nav is unreliable).
|
|
980
|
+
- Click SUCCESS but executed locator ≠ explanation intent, or "skipped" attempts present → wrong element clicked.
|
|
981
|
+
- form(I.type()) SUCCESS but "element" shows a button/link → keys went to wrong element; click the input first.
|
|
982
|
+
- ariaDiff shows 5+ added/removed → page entered new mode (editor/modal); call context() before guessing selectors.
|
|
983
|
+
- Empty dropdown/list when items expected → missing data; call precondition() to create it.
|
|
984
|
+
- Search-and-select needs SEQUENCE: focus trigger → type to filter → click option. Tell Tester to split into separate tool calls.
|
|
985
|
+
- Multi-action explanation in one tool call → instruct Tester to split.
|
|
986
|
+
|
|
987
|
+
xpathCheck strategy when stuck: never guess one exact text. Combine synonyms, aria-label, title,
|
|
988
|
+
role, icon classes with "or" in one XPath. If empty, broaden (drop role filter). Pass discovered
|
|
989
|
+
XPath into NEXT instruction.
|
|
990
|
+
|
|
991
|
+
To request more context, mention ATTACH_HTML, ATTACH_ARIA, or ATTACH_UI_MAP — only when recent actions show failures.
|
|
992
|
+
|
|
993
|
+
Tester tools: click, pressKey, form, see, verify, context, research, xpathCheck, visualClick,
|
|
994
|
+
back, getVisitedStates, reset, stop, finish, record.
|
|
995
|
+
|
|
996
|
+
YOUR Pilot-only tool: precondition(description) — create FRESH disposable test data via API. Never
|
|
997
|
+
request users. Use when:
|
|
998
|
+
- Scenario edits/deletes/modifies an item → create a disposable target ("1 post").
|
|
999
|
+
- Scenario needs auxiliary data (labels, categories, statuses for filtering).
|
|
1000
|
+
- Tester failed because required data is missing (empty dropdown, empty list).
|
|
1001
|
+
|
|
1002
|
+
Skip precondition() when:
|
|
1003
|
+
- Scenario is "Create X" — the test creates it itself.
|
|
1004
|
+
- Current page already shows the exact data needed.
|
|
1005
|
+
- Scenario tests navigation, search UI, or viewing.
|
|
1006
|
+
|
|
1007
|
+
Describe WHAT to create, not what exists. RIGHT: precondition("1 test"). WRONG:
|
|
1008
|
+
precondition("1 test suite named Updated Suite with existing tests"). Keep descriptions short.
|
|
1059
1009
|
|
|
1060
1010
|
Response format:
|
|
1061
1011
|
PROGRESS: <1 sentence assessment>
|