explorbot 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +94 -0
- package/README.md +267 -0
- package/assets/sample-files/sample.docx +0 -0
- package/assets/sample-files/sample.mp3 +0 -0
- package/assets/sample-files/sample.mp4 +0 -0
- package/assets/sample-files/sample.pdf +21 -0
- package/assets/sample-files/sample.png +0 -0
- package/assets/sample-files/sample.xlsx +0 -0
- package/assets/sample-files/sample.zip +0 -0
- package/dist/assets/sample-files/sample.docx +0 -0
- package/dist/assets/sample-files/sample.mp3 +0 -0
- package/dist/assets/sample-files/sample.mp4 +0 -0
- package/dist/assets/sample-files/sample.pdf +21 -0
- package/dist/assets/sample-files/sample.png +0 -0
- package/dist/assets/sample-files/sample.xlsx +0 -0
- package/dist/assets/sample-files/sample.zip +0 -0
- package/dist/bin/explorbot-cli.js +683 -0
- package/dist/bin/explorbot-cli.js.map +1 -0
- package/dist/boat/api-tester/bin/apibot-cli.js +5 -0
- package/dist/boat/api-tester/bin/apibot-cli.js.map +1 -0
- package/dist/boat/api-tester/example/apibot.config.js +31 -0
- package/dist/boat/api-tester/example/apibot.config.js.map +1 -0
- package/dist/boat/api-tester/src/ai/chief/styles.js +13 -0
- package/dist/boat/api-tester/src/ai/chief/styles.js.map +1 -0
- package/dist/boat/api-tester/src/ai/chief.js +301 -0
- package/dist/boat/api-tester/src/ai/chief.js.map +1 -0
- package/dist/boat/api-tester/src/ai/curler-tools.js +263 -0
- package/dist/boat/api-tester/src/ai/curler-tools.js.map +1 -0
- package/dist/boat/api-tester/src/ai/curler.js +271 -0
- package/dist/boat/api-tester/src/ai/curler.js.map +1 -0
- package/dist/boat/api-tester/src/api-client.js +26 -0
- package/dist/boat/api-tester/src/api-client.js.map +1 -0
- package/dist/boat/api-tester/src/apibot.js +166 -0
- package/dist/boat/api-tester/src/apibot.js.map +1 -0
- package/dist/boat/api-tester/src/cli.js +262 -0
- package/dist/boat/api-tester/src/cli.js.map +1 -0
- package/dist/boat/api-tester/src/config.js +159 -0
- package/dist/boat/api-tester/src/config.js.map +1 -0
- package/dist/prompts/audit-rules.md +124 -0
- package/dist/rules/chief/general.md +11 -0
- package/dist/rules/chief/styles/curious.md +12 -0
- package/dist/rules/chief/styles/hacker.md +19 -0
- package/dist/rules/chief/styles/normal.md +11 -0
- package/dist/rules/chief/styles/psycho.md +17 -0
- package/dist/rules/navigator/multiple-locator.md +47 -0
- package/dist/rules/navigator/output.md +69 -0
- package/dist/rules/navigator/verification-actions.md +122 -0
- package/dist/rules/navigator/verification-output.md +53 -0
- package/dist/rules/planner/styles/curious.md +39 -0
- package/dist/rules/planner/styles/normal.md +21 -0
- package/dist/rules/planner/styles/psycho.md +14 -0
- package/dist/rules/researcher/list-element.md +11 -0
- package/dist/rules/researcher/screenshot-ui-map.md +30 -0
- package/dist/rules/researcher/section-ui-map.md +18 -0
- package/dist/rules/researcher/ui-map-table.md +18 -0
- package/dist/src/action-result.js +574 -0
- package/dist/src/action-result.js.map +1 -0
- package/dist/src/action.js +388 -0
- package/dist/src/action.js.map +1 -0
- package/dist/src/activity.js +86 -0
- package/dist/src/activity.js.map +1 -0
- package/dist/src/ai/agent.js +2 -0
- package/dist/src/ai/agent.js.map +1 -0
- package/dist/src/ai/bosun.js +443 -0
- package/dist/src/ai/bosun.js.map +1 -0
- package/dist/src/ai/captain/idle-mode.js +102 -0
- package/dist/src/ai/captain/idle-mode.js.map +1 -0
- package/dist/src/ai/captain/mixin.js +11 -0
- package/dist/src/ai/captain/mixin.js.map +1 -0
- package/dist/src/ai/captain/test-mode.js +251 -0
- package/dist/src/ai/captain/test-mode.js.map +1 -0
- package/dist/src/ai/captain/web-mode.js +124 -0
- package/dist/src/ai/captain/web-mode.js.map +1 -0
- package/dist/src/ai/captain.js +442 -0
- package/dist/src/ai/captain.js.map +1 -0
- package/dist/src/ai/conversation.js +176 -0
- package/dist/src/ai/conversation.js.map +1 -0
- package/dist/src/ai/experience-compactor.js +232 -0
- package/dist/src/ai/experience-compactor.js.map +1 -0
- package/dist/src/ai/fisherman-tools.js +154 -0
- package/dist/src/ai/fisherman-tools.js.map +1 -0
- package/dist/src/ai/fisherman.js +184 -0
- package/dist/src/ai/fisherman.js.map +1 -0
- package/dist/src/ai/historian.js +384 -0
- package/dist/src/ai/historian.js.map +1 -0
- package/dist/src/ai/navigator.js +493 -0
- package/dist/src/ai/navigator.js.map +1 -0
- package/dist/src/ai/pilot.js +684 -0
- package/dist/src/ai/pilot.js.map +1 -0
- package/dist/src/ai/planner/session-dedup.js +28 -0
- package/dist/src/ai/planner/session-dedup.js.map +1 -0
- package/dist/src/ai/planner/styles.js +15 -0
- package/dist/src/ai/planner/styles.js.map +1 -0
- package/dist/src/ai/planner/subpages.js +118 -0
- package/dist/src/ai/planner/subpages.js.map +1 -0
- package/dist/src/ai/planner.js +486 -0
- package/dist/src/ai/planner.js.map +1 -0
- package/dist/src/ai/provider.js +540 -0
- package/dist/src/ai/provider.js.map +1 -0
- package/dist/src/ai/quartermaster.js +210 -0
- package/dist/src/ai/quartermaster.js.map +1 -0
- package/dist/src/ai/researcher/cache.js +95 -0
- package/dist/src/ai/researcher/cache.js.map +1 -0
- package/dist/src/ai/researcher/coordinates.js +210 -0
- package/dist/src/ai/researcher/coordinates.js.map +1 -0
- package/dist/src/ai/researcher/deep-analysis.js +364 -0
- package/dist/src/ai/researcher/deep-analysis.js.map +1 -0
- package/dist/src/ai/researcher/fingerprint-worker.js +46 -0
- package/dist/src/ai/researcher/fingerprint-worker.js.map +1 -0
- package/dist/src/ai/researcher/focus.js +37 -0
- package/dist/src/ai/researcher/focus.js.map +1 -0
- package/dist/src/ai/researcher/locators.js +242 -0
- package/dist/src/ai/researcher/locators.js.map +1 -0
- package/dist/src/ai/researcher/mixin.js +3 -0
- package/dist/src/ai/researcher/mixin.js.map +1 -0
- package/dist/src/ai/researcher/parser.js +160 -0
- package/dist/src/ai/researcher/parser.js.map +1 -0
- package/dist/src/ai/researcher/research-result.js +110 -0
- package/dist/src/ai/researcher/research-result.js.map +1 -0
- package/dist/src/ai/researcher.js +776 -0
- package/dist/src/ai/researcher.js.map +1 -0
- package/dist/src/ai/rules.js +368 -0
- package/dist/src/ai/rules.js.map +1 -0
- package/dist/src/ai/task-agent.js +110 -0
- package/dist/src/ai/task-agent.js.map +1 -0
- package/dist/src/ai/tester.js +840 -0
- package/dist/src/ai/tester.js.map +1 -0
- package/dist/src/ai/tools.js +980 -0
- package/dist/src/ai/tools.js.map +1 -0
- package/dist/src/api/api-client.js +91 -0
- package/dist/src/api/api-client.js.map +1 -0
- package/dist/src/api/request-result.js +177 -0
- package/dist/src/api/request-result.js.map +1 -0
- package/dist/src/api/request-store.js +109 -0
- package/dist/src/api/request-store.js.map +1 -0
- package/dist/src/api/spec-reader.js +148 -0
- package/dist/src/api/spec-reader.js.map +1 -0
- package/dist/src/api/xhr-capture.js +91 -0
- package/dist/src/api/xhr-capture.js.map +1 -0
- package/dist/src/browser-server.js +67 -0
- package/dist/src/browser-server.js.map +1 -0
- package/dist/src/command-handler.js +363 -0
- package/dist/src/command-handler.js.map +1 -0
- package/dist/src/commands/add-rule-command.js +52 -0
- package/dist/src/commands/add-rule-command.js.map +1 -0
- package/dist/src/commands/base-command.js +14 -0
- package/dist/src/commands/base-command.js.map +1 -0
- package/dist/src/commands/clean-command.js +67 -0
- package/dist/src/commands/clean-command.js.map +1 -0
- package/dist/src/commands/context-aria-command.js +18 -0
- package/dist/src/commands/context-aria-command.js.map +1 -0
- package/dist/src/commands/context-command.js +57 -0
- package/dist/src/commands/context-command.js.map +1 -0
- package/dist/src/commands/context-data-command.js +25 -0
- package/dist/src/commands/context-data-command.js.map +1 -0
- package/dist/src/commands/context-experience-command.js +41 -0
- package/dist/src/commands/context-experience-command.js.map +1 -0
- package/dist/src/commands/context-html-command.js +26 -0
- package/dist/src/commands/context-html-command.js.map +1 -0
- package/dist/src/commands/context-knowledge-command.js +36 -0
- package/dist/src/commands/context-knowledge-command.js.map +1 -0
- package/dist/src/commands/debug-command.js +12 -0
- package/dist/src/commands/debug-command.js.map +1 -0
- package/dist/src/commands/drill-command.js +29 -0
- package/dist/src/commands/drill-command.js.map +1 -0
- package/dist/src/commands/exit-command.js +26 -0
- package/dist/src/commands/exit-command.js.map +1 -0
- package/dist/src/commands/explore-command.js +124 -0
- package/dist/src/commands/explore-command.js.map +1 -0
- package/dist/src/commands/freesail-command.js +84 -0
- package/dist/src/commands/freesail-command.js.map +1 -0
- package/dist/src/commands/help-command.js +7 -0
- package/dist/src/commands/help-command.js.map +1 -0
- package/dist/src/commands/index.js +63 -0
- package/dist/src/commands/index.js.map +1 -0
- package/dist/src/commands/knows-command.js +54 -0
- package/dist/src/commands/knows-command.js.map +1 -0
- package/dist/src/commands/learn-command.js +35 -0
- package/dist/src/commands/learn-command.js.map +1 -0
- package/dist/src/commands/navigate-command.js +16 -0
- package/dist/src/commands/navigate-command.js.map +1 -0
- package/dist/src/commands/path-command.js +70 -0
- package/dist/src/commands/path-command.js.map +1 -0
- package/dist/src/commands/plan-clear-command.js +13 -0
- package/dist/src/commands/plan-clear-command.js.map +1 -0
- package/dist/src/commands/plan-command.js +36 -0
- package/dist/src/commands/plan-command.js.map +1 -0
- package/dist/src/commands/plan-edit-command.js +8 -0
- package/dist/src/commands/plan-edit-command.js.map +1 -0
- package/dist/src/commands/plan-load-command.js +16 -0
- package/dist/src/commands/plan-load-command.js.map +1 -0
- package/dist/src/commands/plan-reload-command.js +23 -0
- package/dist/src/commands/plan-reload-command.js.map +1 -0
- package/dist/src/commands/plan-save-command.js +22 -0
- package/dist/src/commands/plan-save-command.js.map +1 -0
- package/dist/src/commands/research-command.js +38 -0
- package/dist/src/commands/research-command.js.map +1 -0
- package/dist/src/commands/start-command.js +12 -0
- package/dist/src/commands/start-command.js.map +1 -0
- package/dist/src/commands/status-command.js +19 -0
- package/dist/src/commands/status-command.js.map +1 -0
- package/dist/src/commands/test-command.js +85 -0
- package/dist/src/commands/test-command.js.map +1 -0
- package/dist/src/components/ActivityPane.js +55 -0
- package/dist/src/components/ActivityPane.js.map +1 -0
- package/dist/src/components/AddKnowledge.js +122 -0
- package/dist/src/components/AddKnowledge.js.map +1 -0
- package/dist/src/components/AddRule.js +117 -0
- package/dist/src/components/AddRule.js.map +1 -0
- package/dist/src/components/App.js +313 -0
- package/dist/src/components/App.js.map +1 -0
- package/dist/src/components/Autocomplete.js +43 -0
- package/dist/src/components/Autocomplete.js.map +1 -0
- package/dist/src/components/InputPane.js +207 -0
- package/dist/src/components/InputPane.js.map +1 -0
- package/dist/src/components/InputReadline.js +598 -0
- package/dist/src/components/InputReadline.js.map +1 -0
- package/dist/src/components/LogPane.js +123 -0
- package/dist/src/components/LogPane.js.map +1 -0
- package/dist/src/components/PlanEditor.js +126 -0
- package/dist/src/components/PlanEditor.js.map +1 -0
- package/dist/src/components/PlanPane.js +51 -0
- package/dist/src/components/PlanPane.js.map +1 -0
- package/dist/src/components/SessionTimer.js +26 -0
- package/dist/src/components/SessionTimer.js.map +1 -0
- package/dist/src/components/StateTransitionPane.js +107 -0
- package/dist/src/components/StateTransitionPane.js.map +1 -0
- package/dist/src/components/StatusPane.js +37 -0
- package/dist/src/components/StatusPane.js.map +1 -0
- package/dist/src/components/TaskPane.js +96 -0
- package/dist/src/components/TaskPane.js.map +1 -0
- package/dist/src/components/Welcome.js +52 -0
- package/dist/src/components/Welcome.js.map +1 -0
- package/dist/src/components/WelcomeChecklist.js +96 -0
- package/dist/src/components/WelcomeChecklist.js.map +1 -0
- package/dist/src/components/WelcomeCommands.js +61 -0
- package/dist/src/components/WelcomeCommands.js.map +1 -0
- package/dist/src/components/autocomplete-store.js +22 -0
- package/dist/src/components/autocomplete-store.js.map +1 -0
- package/dist/src/components/parse-keypress.js +174 -0
- package/dist/src/components/parse-keypress.js.map +1 -0
- package/dist/src/config.js +249 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/execution-controller.js +92 -0
- package/dist/src/execution-controller.js.map +1 -0
- package/dist/src/experience-tracker.js +294 -0
- package/dist/src/experience-tracker.js.map +1 -0
- package/dist/src/explorbot.js +348 -0
- package/dist/src/explorbot.js.map +1 -0
- package/dist/src/explorer.js +611 -0
- package/dist/src/explorer.js.map +1 -0
- package/dist/src/index.js +56 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/knowledge-tracker.js +184 -0
- package/dist/src/knowledge-tracker.js.map +1 -0
- package/dist/src/observability.js +126 -0
- package/dist/src/observability.js.map +1 -0
- package/dist/src/reporter.js +185 -0
- package/dist/src/reporter.js.map +1 -0
- package/dist/src/state-manager.js +427 -0
- package/dist/src/state-manager.js.map +1 -0
- package/dist/src/stats.js +44 -0
- package/dist/src/stats.js.map +1 -0
- package/dist/src/test-plan.js +343 -0
- package/dist/src/test-plan.js.map +1 -0
- package/dist/src/utils/aria.js +588 -0
- package/dist/src/utils/aria.js.map +1 -0
- package/dist/src/utils/code-extractor.js +21 -0
- package/dist/src/utils/code-extractor.js.map +1 -0
- package/dist/src/utils/context-formatter.js +205 -0
- package/dist/src/utils/context-formatter.js.map +1 -0
- package/dist/src/utils/error-page.js +19 -0
- package/dist/src/utils/error-page.js.map +1 -0
- package/dist/src/utils/expandable.js +35 -0
- package/dist/src/utils/expandable.js.map +1 -0
- package/dist/src/utils/hooks-runner.js +77 -0
- package/dist/src/utils/hooks-runner.js.map +1 -0
- package/dist/src/utils/html-diff.js +734 -0
- package/dist/src/utils/html-diff.js.map +1 -0
- package/dist/src/utils/html.js +1163 -0
- package/dist/src/utils/html.js.map +1 -0
- package/dist/src/utils/logger.js +465 -0
- package/dist/src/utils/logger.js.map +1 -0
- package/dist/src/utils/loop.js +126 -0
- package/dist/src/utils/loop.js.map +1 -0
- package/dist/src/utils/markdown-parser.js +117 -0
- package/dist/src/utils/markdown-parser.js.map +1 -0
- package/dist/src/utils/markdown-query.js +393 -0
- package/dist/src/utils/markdown-query.js.map +1 -0
- package/dist/src/utils/markdown-terminal.js +40 -0
- package/dist/src/utils/markdown-terminal.js.map +1 -0
- package/dist/src/utils/research-parser.js +2 -0
- package/dist/src/utils/research-parser.js.map +1 -0
- package/dist/src/utils/retry.js +55 -0
- package/dist/src/utils/retry.js.map +1 -0
- package/dist/src/utils/rules-loader.js +104 -0
- package/dist/src/utils/rules-loader.js.map +1 -0
- package/dist/src/utils/strings.js +14 -0
- package/dist/src/utils/strings.js.map +1 -0
- package/dist/src/utils/test-plan-markdown.js +301 -0
- package/dist/src/utils/test-plan-markdown.js.map +1 -0
- package/dist/src/utils/throttle.js +16 -0
- package/dist/src/utils/throttle.js.map +1 -0
- package/dist/src/utils/unique-names.js +13 -0
- package/dist/src/utils/unique-names.js.map +1 -0
- package/dist/src/utils/url-matcher.js +48 -0
- package/dist/src/utils/url-matcher.js.map +1 -0
- package/dist/src/utils/web-element.js +131 -0
- package/dist/src/utils/web-element.js.map +1 -0
- package/dist/src/utils/xpath.js +110 -0
- package/dist/src/utils/xpath.js.map +1 -0
- package/package.json +119 -0
- package/prompts/audit-rules.md +124 -0
- package/rules/chief/general.md +11 -0
- package/rules/chief/styles/curious.md +12 -0
- package/rules/chief/styles/hacker.md +19 -0
- package/rules/chief/styles/normal.md +11 -0
- package/rules/chief/styles/psycho.md +17 -0
- package/rules/navigator/multiple-locator.md +47 -0
- package/rules/navigator/output.md +69 -0
- package/rules/navigator/verification-actions.md +122 -0
- package/rules/navigator/verification-output.md +53 -0
- package/rules/planner/styles/curious.md +39 -0
- package/rules/planner/styles/normal.md +21 -0
- package/rules/planner/styles/psycho.md +14 -0
- package/rules/researcher/list-element.md +11 -0
- package/rules/researcher/screenshot-ui-map.md +30 -0
- package/rules/researcher/section-ui-map.md +18 -0
- package/rules/researcher/ui-map-table.md +18 -0
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
import { tool } from 'ai';
|
|
2
|
+
import dedent from 'dedent';
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
import { ConfigParser } from "../config.js";
|
|
5
|
+
import { TestResult } from "../test-plan.js";
|
|
6
|
+
import { collectInteractiveNodes, detectFocusArea, extractFocusedElement } from "../utils/aria.js";
|
|
7
|
+
import { createDebug, tag } from "../utils/logger.js";
|
|
8
|
+
// Debug logger for this module, created under the 'explorbot:pilot' namespace.
const debugLog = createDebug('explorbot:pilot');
|
|
9
|
+
import { truncateJson } from "../utils/strings.js";
|
|
10
|
+
import { isInteractive } from "./task-agent.js";
|
|
11
|
+
// Names of tools grouped as "check" tools.
// NOTE(review): the code that consumes this list is outside the visible part of the file — confirm its exact role.
const CHECK_TOOLS = ['verify', 'see', 'research', 'context'];
|
|
12
|
+
// Names of tools grouped as "meta" tools.
// NOTE(review): the code that consumes this list is outside the visible part of the file — confirm its exact role.
const META_TOOLS = ['record', 'reset', 'stop', 'finish'];
|
|
13
|
+
export class Pilot {
|
|
14
|
+
emoji = '🧭';
|
|
15
|
+
provider;
|
|
16
|
+
agentTools;
|
|
17
|
+
conversation = null;
|
|
18
|
+
researcher;
|
|
19
|
+
explorer;
|
|
20
|
+
fisherman = null;
|
|
21
|
+
constructor(provider, agentTools, researcher, explorer) {
|
|
22
|
+
this.provider = provider;
|
|
23
|
+
this.agentTools = agentTools;
|
|
24
|
+
this.researcher = researcher;
|
|
25
|
+
this.explorer = explorer;
|
|
26
|
+
}
|
|
27
|
+
setFisherman(fisherman) {
|
|
28
|
+
this.fisherman = fisherman;
|
|
29
|
+
}
|
|
30
|
+
get stepsToReview() {
|
|
31
|
+
return ConfigParser.getInstance().getConfig().ai?.agents?.pilot?.stepsToReview ?? 5;
|
|
32
|
+
}
|
|
33
|
+
reset() {
|
|
34
|
+
this.conversation = null;
|
|
35
|
+
}
|
|
36
|
+
getLastAnalysis() {
|
|
37
|
+
if (!this.conversation)
|
|
38
|
+
return null;
|
|
39
|
+
return this.conversation.getLastMessage() || null;
|
|
40
|
+
}
|
|
41
|
+
async reviewStop(task, currentState, testerConversation) {
|
|
42
|
+
return this.reviewDecision('stop', task, currentState, testerConversation);
|
|
43
|
+
}
|
|
44
|
+
async reviewFinish(task, currentState, testerConversation) {
|
|
45
|
+
return this.reviewDecision('finish', task, currentState, testerConversation);
|
|
46
|
+
}
|
|
47
|
+
async reviewCompletion(task, currentState, testerConversation) {
|
|
48
|
+
const verdictType = task.hasAchievedAny() ? 'finish' : 'stop';
|
|
49
|
+
return this.reviewDecision(verdictType, task, currentState, testerConversation);
|
|
50
|
+
}
|
|
51
|
+
async finalReview(task, currentState, testerConversation) {
|
|
52
|
+
if (task.hasFinished)
|
|
53
|
+
return false;
|
|
54
|
+
return this.reviewCompletion(task, currentState, testerConversation);
|
|
55
|
+
}
|
|
56
|
+
async reviewDecision(type, task, currentState, testerConversation) {
|
|
57
|
+
tag('substep').log(`Pilot reviewing ${type} verdict...`);
|
|
58
|
+
const sessionLog = this.formatSessionLog(testerConversation);
|
|
59
|
+
const stateContext = this.buildStateContext(currentState);
|
|
60
|
+
const notes = task.notesToString() || 'No notes recorded.';
|
|
61
|
+
let visualAnalysis = '';
|
|
62
|
+
if (this.provider.hasVision()) {
|
|
63
|
+
try {
|
|
64
|
+
const action = this.explorer.createAction();
|
|
65
|
+
const screenshotState = await action.caputrePageWithScreenshot();
|
|
66
|
+
if (screenshotState.screenshot) {
|
|
67
|
+
visualAnalysis = (await this.researcher.answerQuestionAboutScreenshot(screenshotState, `Describe current page state relevant to: ${task.scenario}`)) || '';
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
catch {
|
|
71
|
+
// vision not available, continue without
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
const schema = z.object({
|
|
75
|
+
decision: z.enum(['pass', 'fail', 'continue', 'skipped']).describe('pass = test succeeded, fail = test failed, continue = tester should keep going, skipped = scenario is irrelevant OR systematic execution failures prevented testing'),
|
|
76
|
+
reason: z.string().describe('What happened and why (1-2 sentences). Do NOT repeat the decision status (e.g. "scenario goal achieved/not achieved") — just explain the evidence. For continue: explain why rejected and suggest alternatives.'),
|
|
77
|
+
guidance: z.string().nullable().describe('Required for "continue": specific actionable instruction for the tester — what exactly to verify, retry differently, or complete next. Be concrete.'),
|
|
78
|
+
});
|
|
79
|
+
const userContent = dedent `
|
|
80
|
+
Tester wants to ${type} the test.
|
|
81
|
+
|
|
82
|
+
<state>
|
|
83
|
+
${stateContext}
|
|
84
|
+
</state>
|
|
85
|
+
|
|
86
|
+
${visualAnalysis ? `<visual_analysis>\n${visualAnalysis}\n</visual_analysis>` : ''}
|
|
87
|
+
|
|
88
|
+
${this.formatExpectations(task)}
|
|
89
|
+
|
|
90
|
+
<notes>
|
|
91
|
+
${notes}
|
|
92
|
+
</notes>
|
|
93
|
+
|
|
94
|
+
<session_log>
|
|
95
|
+
${sessionLog || 'No actions recorded'}
|
|
96
|
+
</session_log>
|
|
97
|
+
|
|
98
|
+
Decide:
|
|
99
|
+
- "pass" ONLY if the SCENARIO GOAL is fully accomplished (not just milestones)
|
|
100
|
+
- "fail" if the scenario was attempted but failed
|
|
101
|
+
- "skipped" if the scenario is irrelevant/inapplicable OR systematic execution failures prevented testing (e.g., repeated LLM errors, navigation crashes, tool failures unrelated to the scenario)
|
|
102
|
+
- "continue" if tester hasn't completed the scenario goal yet — even if milestones were checked
|
|
103
|
+
- If evidence is mixed, but final state indicates goal completion, choose "pass"
|
|
104
|
+
- If evidence is mixed and final state is unclear, prefer "continue" over "fail"
|
|
105
|
+
`;
|
|
106
|
+
const messages = [
|
|
107
|
+
{
|
|
108
|
+
role: 'system',
|
|
109
|
+
content: this.buildVerdictSystemPrompt(type, task),
|
|
110
|
+
},
|
|
111
|
+
{ role: 'user', content: userContent },
|
|
112
|
+
];
|
|
113
|
+
try {
|
|
114
|
+
const response = await this.provider.generateObject(messages, schema, this.provider.getAgenticModel('pilot'), {
|
|
115
|
+
agentName: 'pilot',
|
|
116
|
+
experimental_telemetry: { functionId: 'pilot.reviewVerdict' },
|
|
117
|
+
});
|
|
118
|
+
const result = response?.object;
|
|
119
|
+
if (!result) {
|
|
120
|
+
task.finish(TestResult.FAILED);
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
tag('info').log(`Pilot: ${result.decision} — ${result.reason}`);
|
|
124
|
+
task.summary = result.reason;
|
|
125
|
+
if (result.decision === 'pass') {
|
|
126
|
+
task.addNote(`Pilot: ${result.reason}`, TestResult.PASSED);
|
|
127
|
+
task.finish(TestResult.PASSED);
|
|
128
|
+
return false;
|
|
129
|
+
}
|
|
130
|
+
if (result.decision === 'fail') {
|
|
131
|
+
task.addNote(`Pilot: ${result.reason}`, TestResult.FAILED);
|
|
132
|
+
task.finish(TestResult.FAILED);
|
|
133
|
+
return false;
|
|
134
|
+
}
|
|
135
|
+
if (result.decision === 'skipped') {
|
|
136
|
+
task.addNote(`Pilot: skipped — ${result.reason}`, TestResult.SKIPPED);
|
|
137
|
+
task.finish(TestResult.SKIPPED);
|
|
138
|
+
return false;
|
|
139
|
+
}
|
|
140
|
+
task.addNote(`Pilot: continue — ${result.reason}`);
|
|
141
|
+
const guidanceText = result.guidance ? `\n\nWhat to do next: ${result.guidance}` : '';
|
|
142
|
+
testerConversation.addUserText(`Pilot: ${result.reason}${guidanceText}`);
|
|
143
|
+
return true;
|
|
144
|
+
}
|
|
145
|
+
catch (error) {
|
|
146
|
+
tag('warning').log(`Pilot verdict failed: ${error.message}`);
|
|
147
|
+
task.finish(TestResult.FAILED);
|
|
148
|
+
return false;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
buildVerdictSystemPrompt(type, task) {
|
|
152
|
+
return dedent `
|
|
153
|
+
You are Pilot — the final decision maker for test pass/fail.
|
|
154
|
+
Tester has requested to ${type} the test. Review the evidence and decide.
|
|
155
|
+
|
|
156
|
+
SCENARIO: ${task.scenario}
|
|
157
|
+
|
|
158
|
+
The SCENARIO is the primary goal. The test can only pass if the scenario goal is fully accomplished.
|
|
159
|
+
PRIORITY ORDER (strict):
|
|
160
|
+
1) Final observable state proving the scenario goal
|
|
161
|
+
2) Verification evidence (if provided)
|
|
162
|
+
3) Intermediate action/step outcomes
|
|
163
|
+
If final state evidence proves the scenario goal, PASS even when some intermediate actions failed.
|
|
164
|
+
Do not fail only because a specific click failed, no toast appeared, or navigation was different than expected.
|
|
165
|
+
Intermediate failures are diagnostic, not decisive, when end state confirms success.
|
|
166
|
+
Expected results are helpful milestones but they DO NOT override the scenario goal.
|
|
167
|
+
NEVER fail a test because an expected result (milestone) was not met when the scenario goal itself IS accomplished.
|
|
168
|
+
The SCENARIO TITLE defines what must happen. If the title says "Create X and verify it appears" and X was created and appears — that's a PASS, even if some milestone about icons/status/styling was not met.
|
|
169
|
+
If the scenario says "Create X", then X must be created — opening a form or navigating to /new URL is NOT enough. There must be evidence that the item now exists: visible on page, redirected to the item's page, or a success/confirmation message appeared.
|
|
170
|
+
If the scenario says "Delete X", then X must be deleted — clicking delete button is not enough. There must be evidence the item is gone.
|
|
171
|
+
If the scenario says "Edit X", then changes must be saved — opening an edit form is NOT enough.
|
|
172
|
+
For edit/update/rename scenarios, persisted updated value visible in list/detail view is valid save evidence, even without toast and even if page redirected away from edit view.
|
|
173
|
+
DO NOT trust Tester's self-assessment in notes (like "scenario goal achieved"). Verify against actual actions and state.
|
|
174
|
+
EVIDENCE SOURCES: verify(), see(), visual_analysis, and action results in session_log are all evidence. They may disagree — analyze all of them together to reach your decision. No single source automatically overrides the others. Visual analysis from screenshots is strong evidence for UI state (active tabs, visible items, counts, colors). Tester's self-assessment in record() notes is the least reliable — always cross-check against actual evidence.
|
|
175
|
+
SESSION LOG shows ALL actions grouped by URL. If the scenario requires changing data (edit/create/delete) but all form/click actions FAILED, the test cannot pass — even if a verify() found matching content that existed before the test.
|
|
176
|
+
|
|
177
|
+
VERIFICATION RULE: Only the LAST few actions before finish/stop count as verification evidence.
|
|
178
|
+
- If verify() or see() is among the last actions → use its result as evidence.
|
|
179
|
+
- If no verification was done → prefer "continue" with guidance telling tester what to verify.
|
|
180
|
+
- If verify assertion describes a state that was ALREADY TRUE before the test started, the verification proves nothing — reject with "continue".
|
|
181
|
+
|
|
182
|
+
GUIDANCE FIELD: When decision is "continue", you MUST provide "guidance" — a specific actionable instruction:
|
|
183
|
+
- If evidence is insufficient: tell tester to verify with see()/verify(), specify WHAT to check
|
|
184
|
+
- If approach was wrong: tell tester to try a different method, suggest which one
|
|
185
|
+
- If remaining steps exist: tell tester which steps to complete next
|
|
186
|
+
Be concrete. Example: "Use see() to check if the description text appears in the Description tab panel" not "verify the result".
|
|
187
|
+
Do NOT tell tester to redo the same actions that already succeeded.
|
|
188
|
+
|
|
189
|
+
NEGATIVE TESTS: Some scenarios test that something CANNOT or SHOULD NOT happen.
|
|
190
|
+
Patterns: "without a name", "with invalid data", "empty field", "wrong password", "unauthorized", "duplicate".
|
|
191
|
+
For negative tests, success means the system PREVENTED the action — error messages, validation, disabled buttons.
|
|
192
|
+
Example: "Create X without a name" PASSES if X was NOT created and validation appeared.
|
|
193
|
+
|
|
194
|
+
SKIPPED TESTS: Choose "skipped" in two cases:
|
|
195
|
+
1) Scenario is irrelevant: feature doesn't exist on the page, required UI elements are completely absent, scenario prerequisites cannot be met.
|
|
196
|
+
2) Systematic execution failures: repeated LLM/API errors, navigation crashes, tool failures unrelated to the scenario itself. These are infrastructure problems, not test failures.
|
|
197
|
+
Do NOT use "skipped" when the feature exists but the test just failed to interact with it — that's "fail" or "continue".
|
|
198
|
+
|
|
199
|
+
${this.buildDeletionScope(task)}
|
|
200
|
+
|
|
201
|
+
REASON FORMAT: The "reason" field goes into the test report. Do NOT start with "The scenario goal was/was not achieved" or similar status phrases — the decision field already conveys that. Instead, state what happened: what was verified, what failed, or what evidence was found.
|
|
202
|
+
|
|
203
|
+
EXPECTED RESULTS (milestones, not the goal):
|
|
204
|
+
${task.expected.map((e) => `- ${e}`).join('\n')}
|
|
205
|
+
`;
|
|
206
|
+
}
|
|
207
|
+
async planTest(task, currentState) {
|
|
208
|
+
tag('substep').log('Pilot planning test...');
|
|
209
|
+
debugLog(`planTest: ${task.scenario}, fisherman: ${this.fisherman ? 'available' : 'none'}`);
|
|
210
|
+
const pageSummary = await this.researcher.summary(currentState, {
|
|
211
|
+
allowNewResearch: false,
|
|
212
|
+
});
|
|
213
|
+
const agenticModel = this.provider.getAgenticModel('pilot');
|
|
214
|
+
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState, pageSummary), 'pilot', agenticModel);
|
|
215
|
+
const stateContext = this.buildStateContext(currentState);
|
|
216
|
+
return this.sendToPilot(dedent `
|
|
217
|
+
<state>
|
|
218
|
+
${stateContext}
|
|
219
|
+
</state>
|
|
220
|
+
|
|
221
|
+
${pageSummary ? `<page_summary>\n${pageSummary}\n</page_summary>` : ''}
|
|
222
|
+
|
|
223
|
+
Plan the test execution for this scenario.
|
|
224
|
+
|
|
225
|
+
FIRST: Call precondition() to create fresh data that this test will act on.
|
|
226
|
+
Ask: "What will this test edit/delete/use?" — create THAT item via precondition.
|
|
227
|
+
Do not describe what's already on the page — create new disposable items for the test.
|
|
228
|
+
|
|
229
|
+
THEN: Based on the page elements and current state, outline:
|
|
230
|
+
1. Which elements to interact with and in what order
|
|
231
|
+
2. What to verify at each step
|
|
232
|
+
3. Potential issues to watch for
|
|
233
|
+
|
|
234
|
+
Before planning navigation to another page, assume the current page may already contain
|
|
235
|
+
the elements needed for the scenario. The page summary does not list every element.
|
|
236
|
+
Prefer interacting with the current page over navigating away.
|
|
237
|
+
|
|
238
|
+
Be concise and specific. Tester will follow your plan.
|
|
239
|
+
`, 'pilot.planTest', { tools: true, maxToolRoundtrips: 3, task });
|
|
240
|
+
}
|
|
241
|
+
async reviewNewPage(task, currentState) {
|
|
242
|
+
if (!this.conversation)
|
|
243
|
+
return '';
|
|
244
|
+
tag('substep').log('Pilot reviewing new page...');
|
|
245
|
+
const pageSummary = await this.researcher.summary(currentState, {
|
|
246
|
+
allowNewResearch: false,
|
|
247
|
+
});
|
|
248
|
+
if (!pageSummary)
|
|
249
|
+
return '';
|
|
250
|
+
const stateContext = this.buildStateContext(currentState);
|
|
251
|
+
this.conversation.cleanupTag('page_summary', '...trimmed...', 1);
|
|
252
|
+
return this.sendToPilot(dedent `
|
|
253
|
+
Navigated to new page.
|
|
254
|
+
START URL: ${task.startUrl}
|
|
255
|
+
|
|
256
|
+
<state>
|
|
257
|
+
${stateContext}
|
|
258
|
+
</state>
|
|
259
|
+
|
|
260
|
+
<page_summary>
|
|
261
|
+
${pageSummary}
|
|
262
|
+
</page_summary>
|
|
263
|
+
|
|
264
|
+
${this.formatExpectations(task)}
|
|
265
|
+
|
|
266
|
+
First: evaluate whether this navigation makes sense for the scenario goal. If the page is unrelated, instruct Tester to back() or reset(). Then plan next steps.
|
|
267
|
+
`, 'pilot.reviewNewPage');
|
|
268
|
+
}
|
|
269
|
+
async analyzeProgress(task, currentState, testerConversation) {
|
|
270
|
+
tag('substep').log('Pilot analyzing progress...');
|
|
271
|
+
if (!this.conversation) {
|
|
272
|
+
const pageSummary = await this.researcher.summary(currentState, {
|
|
273
|
+
allowNewResearch: false,
|
|
274
|
+
});
|
|
275
|
+
const agenticModel = this.provider.getAgenticModel('pilot');
|
|
276
|
+
this.conversation = this.provider.startConversation(this.getSystemPrompt(task, currentState, pageSummary), 'pilot', agenticModel);
|
|
277
|
+
}
|
|
278
|
+
const toolCalls = testerConversation.getToolExecutions().slice(-this.stepsToReview);
|
|
279
|
+
const actionsContext = this.formatActions(toolCalls);
|
|
280
|
+
const stateContext = this.buildStateContext(currentState);
|
|
281
|
+
this.conversation.cleanupTag('recent_actions', '...trimmed...', 2);
|
|
282
|
+
const hasFailures = toolCalls.length === 0 || toolCalls.some((t) => !t.wasSuccessful);
|
|
283
|
+
const text = await this.sendToPilot(dedent `
|
|
284
|
+
START URL: ${task.startUrl}
|
|
285
|
+
|
|
286
|
+
<state>
|
|
287
|
+
${stateContext}
|
|
288
|
+
</state>
|
|
289
|
+
|
|
290
|
+
${this.formatExpectations(task)}
|
|
291
|
+
|
|
292
|
+
<recent_actions>
|
|
293
|
+
${actionsContext || 'None'}
|
|
294
|
+
</recent_actions>
|
|
295
|
+
|
|
296
|
+
What should Tester do next?
|
|
297
|
+
`, 'pilot.analyze', { tools: hasFailures, maxToolRoundtrips: hasFailures ? 2 : 0, task });
|
|
298
|
+
const contextToAttach = await this.fetchRequestedContext(text, currentState);
|
|
299
|
+
if (contextToAttach) {
|
|
300
|
+
return `${text}\n\n${contextToAttach}`;
|
|
301
|
+
}
|
|
302
|
+
return text;
|
|
303
|
+
}
|
|
304
|
+
formatExpectations(task) {
|
|
305
|
+
const checked = task.getCheckedExpectations();
|
|
306
|
+
const remaining = task.getRemainingExpectations();
|
|
307
|
+
return `CHECKED: ${checked.length > 0 ? checked.join(', ') : 'none'}\nREMAINING: ${remaining.length > 0 ? remaining.join(', ') : 'none'}`;
|
|
308
|
+
}
|
|
309
|
+
async sendToPilot(userText, functionId, opts = {}) {
|
|
310
|
+
debugLog(`sendToPilot: ${functionId}, tools: ${!!opts.tools}, roundtrips: ${opts.maxToolRoundtrips ?? 0}`);
|
|
311
|
+
this.conversation.addUserText(userText);
|
|
312
|
+
let tools = opts.tools ? this.agentTools : undefined;
|
|
313
|
+
if (opts.tools && opts.task) {
|
|
314
|
+
tools = { ...tools, ...this.buildPreconditionTool(opts.task) };
|
|
315
|
+
}
|
|
316
|
+
const result = await this.provider.invokeConversation(this.conversation, tools, {
|
|
317
|
+
maxToolRoundtrips: opts.maxToolRoundtrips ?? 0,
|
|
318
|
+
agentName: 'pilot',
|
|
319
|
+
experimental_telemetry: { functionId },
|
|
320
|
+
});
|
|
321
|
+
return result?.response?.text || '';
|
|
322
|
+
}
|
|
323
|
+
buildPreconditionTool(task) {
|
|
324
|
+
return {
|
|
325
|
+
precondition: tool({
|
|
326
|
+
description: 'Create fresh disposable data that the test will act on (edit, delete, filter). Describe WHAT to create, not what exists. Do NOT request users. Examples: "1 post", "1 comment", "1 label named Bug".',
|
|
327
|
+
inputSchema: z.object({
|
|
328
|
+
description: z.string().describe('What data is needed, e.g. "1 post and 2 comments in it"'),
|
|
329
|
+
}),
|
|
330
|
+
execute: async ({ description }) => {
|
|
331
|
+
task.addNote(`Precondition: ${description}`);
|
|
332
|
+
tag('info').log(`Precondition: ${description}`);
|
|
333
|
+
debugLog(`precondition: ${description}, fisherman: ${this.fisherman?.isAvailable() ? 'available' : 'none'}`);
|
|
334
|
+
if (!this.fisherman || !this.fisherman.isAvailable()) {
|
|
335
|
+
return { noted: true, prepared: false, reason: 'Fisherman not available' };
|
|
336
|
+
}
|
|
337
|
+
const result = await this.fisherman.prepareData(description, task.startUrl, task.sessionName);
|
|
338
|
+
if (!result.success || result.created.length === 0) {
|
|
339
|
+
if (result.summary)
|
|
340
|
+
tag('warning').log(`Precondition failed: ${result.summary}`);
|
|
341
|
+
return { noted: true, prepared: false, reason: result.summary };
|
|
342
|
+
}
|
|
343
|
+
const items = result.created.map((c) => {
|
|
344
|
+
const parts = [c.type];
|
|
345
|
+
if (c.title)
|
|
346
|
+
parts.push(`"${c.title}"`);
|
|
347
|
+
if (c.id)
|
|
348
|
+
parts.push(`(id: ${c.id})`);
|
|
349
|
+
return parts.join(' ');
|
|
350
|
+
});
|
|
351
|
+
const stepText = `Precondition: created ${items.join(', ')}`;
|
|
352
|
+
task.addStep(stepText);
|
|
353
|
+
tag('success').log(stepText);
|
|
354
|
+
return { noted: true, prepared: true, created: result.created };
|
|
355
|
+
},
|
|
356
|
+
}),
|
|
357
|
+
};
|
|
358
|
+
}
|
|
359
|
+
buildStateContext(state) {
|
|
360
|
+
const lines = [];
|
|
361
|
+
lines.push(`url: ${state.url}`);
|
|
362
|
+
lines.push(`title: ${state.title || 'unknown'}`);
|
|
363
|
+
const focused = extractFocusedElement(state.ariaSnapshot);
|
|
364
|
+
if (focused) {
|
|
365
|
+
const valuePart = focused.value ? ` (value: "${focused.value}")` : '';
|
|
366
|
+
lines.push(`focused: ${focused.role} "${focused.name}"${valuePart}`);
|
|
367
|
+
}
|
|
368
|
+
lines.push(`h1: ${state.h1 || ''}`);
|
|
369
|
+
lines.push(`h2: ${state.h2 || ''}`);
|
|
370
|
+
lines.push(`h3: ${state.h3 || ''}`);
|
|
371
|
+
lines.push(`h4: ${state.h4 || ''}`);
|
|
372
|
+
const focusArea = detectFocusArea(state.ariaSnapshot);
|
|
373
|
+
if (focusArea.detected) {
|
|
374
|
+
lines.push(`modal: ${focusArea.name || focusArea.type}`);
|
|
375
|
+
}
|
|
376
|
+
else {
|
|
377
|
+
lines.push('modal: none');
|
|
378
|
+
}
|
|
379
|
+
if (this.explorer.hasOtherTabs()) {
|
|
380
|
+
const tabs = this.explorer.getOtherTabsInfo();
|
|
381
|
+
lines.push(`other tabs: ${tabs.length} (${tabs.map((t) => `${t.url} - ${t.title}`).join(', ')})`);
|
|
382
|
+
}
|
|
383
|
+
else {
|
|
384
|
+
lines.push('other tabs: none');
|
|
385
|
+
}
|
|
386
|
+
const verifications = Object.entries(state.verifications ?? {});
|
|
387
|
+
if (verifications.length > 0) {
|
|
388
|
+
const verifyLines = verifications.map(([a, v]) => `${v ? 'PASS' : 'FAIL'}: ${a}`);
|
|
389
|
+
lines.push(`verifications: ${verifyLines.join(', ')}`);
|
|
390
|
+
}
|
|
391
|
+
const interactiveNodes = collectInteractiveNodes(state.ariaSnapshot);
|
|
392
|
+
const disabledButtons = interactiveNodes.filter((n) => n.role === 'button' && n.disabled === true && n.name).map((n) => n.name);
|
|
393
|
+
lines.push(`disabled buttons: ${disabledButtons.length > 0 ? disabledButtons.join(', ') : 'none'}`);
|
|
394
|
+
const formFields = interactiveNodes.filter((n) => n.role === 'textbox' || n.role === 'combobox' || n.role === 'select' || n.role === 'searchbox' || n.role === 'spinbutton');
|
|
395
|
+
if (formFields.length > 0) {
|
|
396
|
+
const fieldDescriptions = formFields.map((f) => {
|
|
397
|
+
let desc = `${f.role} "${f.name || ''}"`;
|
|
398
|
+
if (f.required)
|
|
399
|
+
desc += ' [required]';
|
|
400
|
+
return desc;
|
|
401
|
+
});
|
|
402
|
+
lines.push(`active form: ${fieldDescriptions.join(', ')}`);
|
|
403
|
+
}
|
|
404
|
+
return lines.join('\n');
|
|
405
|
+
}
|
|
406
|
+
async fetchRequestedContext(text, currentState) {
|
|
407
|
+
const parts = [];
|
|
408
|
+
if (text.includes('ATTACH_HTML')) {
|
|
409
|
+
const html = await currentState.simplifiedHtml();
|
|
410
|
+
parts.push(dedent `
|
|
411
|
+
<page_html>
|
|
412
|
+
${html}
|
|
413
|
+
</page_html>
|
|
414
|
+
`);
|
|
415
|
+
}
|
|
416
|
+
if (text.includes('ATTACH_ARIA')) {
|
|
417
|
+
parts.push(dedent `
|
|
418
|
+
<page_aria>
|
|
419
|
+
${currentState.getInteractiveARIA()}
|
|
420
|
+
</page_aria>
|
|
421
|
+
`);
|
|
422
|
+
}
|
|
423
|
+
if (text.includes('ATTACH_SUMMARY')) {
|
|
424
|
+
const summary = await this.researcher.summary(currentState);
|
|
425
|
+
if (summary) {
|
|
426
|
+
parts.push(dedent `
|
|
427
|
+
<page_summary>
|
|
428
|
+
${summary}
|
|
429
|
+
</page_summary>
|
|
430
|
+
`);
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
if (text.includes('ATTACH_UI_MAP')) {
|
|
434
|
+
const uiMap = await this.researcher.research(currentState);
|
|
435
|
+
if (uiMap) {
|
|
436
|
+
parts.push(dedent `
|
|
437
|
+
<page_ui_map>
|
|
438
|
+
${uiMap}
|
|
439
|
+
</page_ui_map>
|
|
440
|
+
`);
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
return parts.join('\n\n');
|
|
444
|
+
}
|
|
445
|
+
formatSessionLog(testerConversation) {
|
|
446
|
+
const executions = testerConversation.getToolExecutions().filter((t) => !META_TOOLS.includes(t.toolName));
|
|
447
|
+
const stateHistory = this.explorer.getStateManager().getStateHistory();
|
|
448
|
+
const initialUrl = stateHistory[0]?.toState?.url || '';
|
|
449
|
+
let currentUrl = initialUrl;
|
|
450
|
+
const groups = new Map();
|
|
451
|
+
const ensureGroup = (url) => {
|
|
452
|
+
if (!groups.has(url)) {
|
|
453
|
+
const matchingState = stateHistory.find((t) => t.toState.url === url)?.toState;
|
|
454
|
+
groups.set(url, {
|
|
455
|
+
title: matchingState?.title,
|
|
456
|
+
h1: matchingState?.h1,
|
|
457
|
+
h3: matchingState?.h3,
|
|
458
|
+
lines: [],
|
|
459
|
+
});
|
|
460
|
+
}
|
|
461
|
+
};
|
|
462
|
+
ensureGroup(currentUrl);
|
|
463
|
+
for (const exec of executions) {
|
|
464
|
+
if (!CHECK_TOOLS.includes(exec.toolName) && exec.output?.url && exec.output.url !== currentUrl) {
|
|
465
|
+
currentUrl = exec.output.url;
|
|
466
|
+
ensureGroup(currentUrl);
|
|
467
|
+
}
|
|
468
|
+
const description = exec.input?.explanation || exec.input?.assertion || exec.input?.request || truncateJson(exec.input);
|
|
469
|
+
const status = exec.wasSuccessful ? 'OK' : 'FAILED';
|
|
470
|
+
let line = `${exec.toolName} '${description}' -> ${status}`;
|
|
471
|
+
if (exec.toolName === 'verify') {
|
|
472
|
+
if (!exec.wasSuccessful && exec.output?.alreadyVerified) {
|
|
473
|
+
line = `${exec.toolName} '${description}' -> BLOCKED (already verified on this state)`;
|
|
474
|
+
}
|
|
475
|
+
else if (exec.output?.code) {
|
|
476
|
+
line += `\n code: ${exec.output.code}`;
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
const analysisText = exec.output?.analysis;
|
|
480
|
+
const resultMessage = analysisText ? (analysisText.length > 500 ? `${analysisText.slice(0, 500)}...` : analysisText) : exec.output?.message || exec.output?.result;
|
|
481
|
+
if (resultMessage && (CHECK_TOOLS.includes(exec.toolName) || !exec.wasSuccessful)) {
|
|
482
|
+
line += `\n result: ${resultMessage}`;
|
|
483
|
+
}
|
|
484
|
+
groups.get(currentUrl).lines.push(line);
|
|
485
|
+
}
|
|
486
|
+
const parts = [];
|
|
487
|
+
for (const [url, group] of groups) {
|
|
488
|
+
const header = [url];
|
|
489
|
+
if (group.title)
|
|
490
|
+
header.push(` title: ${group.title}`);
|
|
491
|
+
if (group.h1)
|
|
492
|
+
header.push(` h1: ${group.h1}`);
|
|
493
|
+
if (group.h3)
|
|
494
|
+
header.push(` h3: ${group.h3}`);
|
|
495
|
+
header.push('');
|
|
496
|
+
const lines = group.lines.map((l) => ` ${l}`);
|
|
497
|
+
parts.push([...header, ...lines].join('\n'));
|
|
498
|
+
}
|
|
499
|
+
return parts.join('\n\n');
|
|
500
|
+
}
|
|
501
|
+
formatActions(toolCalls) {
|
|
502
|
+
return toolCalls
|
|
503
|
+
.map((t) => {
|
|
504
|
+
const status = t.wasSuccessful ? 'SUCCESS' : 'FAILED';
|
|
505
|
+
const kind = CHECK_TOOLS.includes(t.toolName) ? 'CHECK' : 'ACTION';
|
|
506
|
+
const description = t.input?.explanation || t.input?.request || truncateJson(t.input);
|
|
507
|
+
const analysisText = t.output?.analysis;
|
|
508
|
+
const resultMessage = analysisText ? (analysisText.length > 500 ? `${analysisText.slice(0, 500)}...` : analysisText) : t.output?.message || '';
|
|
509
|
+
const errorDetail = t.output?.attempts?.find((a) => a.error)?.error;
|
|
510
|
+
let line = `[${status}] ${kind} ${t.toolName}: ${description}`;
|
|
511
|
+
const executedCode = t.output?.code;
|
|
512
|
+
if (executedCode && t.toolName === 'click') {
|
|
513
|
+
line += `\n executed: ${executedCode}`;
|
|
514
|
+
}
|
|
515
|
+
const targeted = t.output?.targetedHtml;
|
|
516
|
+
if (targeted) {
|
|
517
|
+
line += `\n element: ${targeted}`;
|
|
518
|
+
}
|
|
519
|
+
if (resultMessage)
|
|
520
|
+
line += `\n result: ${resultMessage}`;
|
|
521
|
+
if (errorDetail && errorDetail !== resultMessage)
|
|
522
|
+
line += `\n error: ${errorDetail}`;
|
|
523
|
+
const attempts = t.output?.attempts;
|
|
524
|
+
if (attempts && attempts.length > 1 && t.wasSuccessful) {
|
|
525
|
+
const failedBefore = attempts.filter((a) => !a.success);
|
|
526
|
+
if (failedBefore.length > 0) {
|
|
527
|
+
line += `\n skipped: ${failedBefore.map((a) => a.command).join(', ')}`;
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
const ariaDiff = t.output?.pageDiff?.ariaChanges;
|
|
531
|
+
if (ariaDiff)
|
|
532
|
+
line += `\n ${ariaDiff}`;
|
|
533
|
+
return line;
|
|
534
|
+
})
|
|
535
|
+
.join('\n\n');
|
|
536
|
+
}
|
|
537
|
+
buildDeletionScope(task) {
|
|
538
|
+
const deletableItems = task.plan
|
|
539
|
+
? task.plan
|
|
540
|
+
.listTests()
|
|
541
|
+
.filter((t) => t.isSuccessful && t.sessionName)
|
|
542
|
+
.map((t) => t.sessionName)
|
|
543
|
+
: [];
|
|
544
|
+
const scenarioLower = task.scenario.toLowerCase();
|
|
545
|
+
if (deletableItems.length > 0) {
|
|
546
|
+
return `For deletion scenarios, items can only be deleted if their title contains: ${deletableItems.join(', ')}`;
|
|
547
|
+
}
|
|
548
|
+
if (scenarioLower.includes('delete') || scenarioLower.includes('remove')) {
|
|
549
|
+
return 'No items available for deletion — test should create an item first';
|
|
550
|
+
}
|
|
551
|
+
return '';
|
|
552
|
+
}
|
|
553
|
+
getSystemPrompt(task, initialState, pageSummary) {
|
|
554
|
+
const interactive = isInteractive();
|
|
555
|
+
const stepsText = task.plannedSteps.length > 0 ? task.plannedSteps.map((s, i) => `${i + 1}. ${s}`).join('\n') : 'No planned steps';
|
|
556
|
+
return dedent `
|
|
557
|
+
You are Pilot - a supervisor that detects problems and intervenes only when needed.
|
|
558
|
+
|
|
559
|
+
SCENARIO: ${task.scenario}
|
|
560
|
+
START URL: ${initialState.url}
|
|
561
|
+
PAGE: ${initialState.title || ''} | ${initialState.h1 || ''}
|
|
562
|
+
|
|
563
|
+
EXPECTED RESULTS:
|
|
564
|
+
${task.expected.map((e) => `- ${e}`).join('\n')}
|
|
565
|
+
|
|
566
|
+
PLANNED STEPS:
|
|
567
|
+
${stepsText}
|
|
568
|
+
|
|
569
|
+
${pageSummary ? `PAGE SUMMARY:\n${pageSummary}` : ''}
|
|
570
|
+
|
|
571
|
+
Your job:
|
|
572
|
+
1. Plan test execution by reviewing page elements and scenario requirements
|
|
573
|
+
2. When Tester navigates to a new page, review available elements and plan next steps
|
|
574
|
+
3. Detect when Tester is stuck: repeated failures, loops, or wrong direction
|
|
575
|
+
4. Track which expectations have been checked and which remain
|
|
576
|
+
5. When problems are detected, suggest concrete alternative approaches
|
|
577
|
+
6. When everything is going well, give brief encouragement and let Tester continue
|
|
578
|
+
7. Before suggesting navigation to another page, assume the current page may already have what the scenario needs. The page summary is incomplete — not every element is listed. Prefer exploring the current page first.
|
|
579
|
+
|
|
580
|
+
Already-achieved state detection:
|
|
581
|
+
- When planning or reviewing, check if the scenario goal is ALREADY met in the current state (page_summary, ariaDiff, or state context).
|
|
582
|
+
- If the goal appears already achieved at start: adapt the scenario — suggest different input values or data to make the test meaningful.
|
|
583
|
+
- If the goal was achieved by a previous action (SUCCESS in recent_actions with confirming ariaDiff): instruct Tester to verify() the result and finish(). Do NOT repeat the same action.
|
|
584
|
+
- If Tester keeps re-opening the same panel and re-submitting the same data — STOP. The action was already completed.
|
|
585
|
+
|
|
586
|
+
Navigation awareness — always compare current page url to START URL:
|
|
587
|
+
- subpage navigation (deeper path from START URL) — OK, scenario may need sub-pages
|
|
588
|
+
- outer-page navigation (parent/sibling path from START URL) — SUSPICIOUS. The scenario target is on the START page. Do NOT rationalize leaving it. Instruct Tester to back() or reset().
|
|
589
|
+
- outer-site navigation (different domain) — WRONG. Instruct Tester to reset() immediately.
|
|
590
|
+
|
|
591
|
+
IMPORTANT — Tool usage policy:
|
|
592
|
+
- DO NOT use tools (see, context) when Tester is making progress and no failures are recorded
|
|
593
|
+
- Tester already has full ARIA and HTML context — do not duplicate that work
|
|
594
|
+
- ONLY use see/context tools when Tester has failed 2+ times on the same element or action
|
|
595
|
+
- Use xpathCheck proactively when Tester fails to find an element even ONCE (element not found error)
|
|
596
|
+
- If Tester's ARIA locator used wrong role (e.g. "textbox" instead of "combobox"), use xpathCheck to identify the correct element
|
|
597
|
+
- After finding the element via xpathCheck, include the discovered locator in your NEXT instruction
|
|
598
|
+
${interactive ? '- Use askUser() only as last resort when automated recovery has failed' : ''}
|
|
599
|
+
|
|
600
|
+
Diagnosing failures — use <state> context:
|
|
601
|
+
- Button click failed AND that button is in "disabled buttons" → button is disabled, not missing. Check "active form" for unfilled [required] fields. Instruct Tester to fill required fields first.
|
|
602
|
+
- Form submit failed → check "active form" for fields that may need values. Instruct Tester to fill them before retrying submit.
|
|
603
|
+
- "modal: none" but Tester tries to interact with a modal → modal was closed or never opened. Instruct Tester to re-trigger the modal.
|
|
604
|
+
- Actions succeed but ariaDiff is empty → action may have worked without visible DOM changes. Check result message before assuming failure.
|
|
605
|
+
- Multiple elements matched (MultipleElementsFound) → use xpathCheck() to inspect the matched elements and determine which one is correct. Then instruct Tester with a precise locator or suggest visualClick() to click the right element by visual appearance.
|
|
606
|
+
- Tester navigated to a page unrelated to the scenario (e.g., settings instead of feature page) → use getVisitedStates() to check which pages were visited, then suggest back() to return to a relevant page, or reset() if multiple wrong navigations occurred. Do NOT try navigating back via breadcrumbs or links — SPA frameworks make manual back-navigation unreliable.
|
|
607
|
+
- If diagnosis is unclear, ariaDiff is empty, and your previous advice didn't help → suggest Tester use see() to visually inspect the page. But ONLY as a last resort after other diagnostics failed.
|
|
608
|
+
- Click succeeded but ariaDiff shows elements unrelated to tester's intention (e.g., clicked "Edit" but dropdown appeared) → wrong button or unexpected behavior. Instruct Tester to Escape and try a different approach.
|
|
609
|
+
- form(I.type()) succeeded → I.type() sends keys to whatever is focused, no guarantee it's the right field. Instruct Tester to verify with see() that text appeared in the correct field. If targetedHtml shows a button/link, text went to wrong element — click the correct field first and retry.
|
|
610
|
+
- ariaDiff shows 5+ elements removed/added after clicking content → page entered a different mode (editor, panel, modal). Instruct Tester to call context() to see current state before guessing selectors.
|
|
611
|
+
|
|
612
|
+
Detecting logically wrong successes — review "executed", "element", and "skipped" fields:
|
|
613
|
+
- Click SUCCESS but "executed" command differs from "explanation" intent → wrong element was clicked. The intended element wasn't found and a different one was clicked instead.
|
|
614
|
+
- Click SUCCESS with "skipped" commands listed → earlier attempts failed, fell through to a different locator. Check if the successful locator actually targets the intended element.
|
|
615
|
+
- form(I.type()) SUCCESS but "element" shows a button/link instead of input → text went to wrong element. Instruct Tester to click the correct input first.
|
|
616
|
+
- Action SUCCESS but ariaDiff shows changes unrelated to the stated goal → action hit the wrong target. Instruct Tester to undo (Escape/back) and retry with precise locator.
|
|
617
|
+
- If Tester's explanation mentions TWO distinct actions in ONE tool call → flag this. Each distinct action should be a separate tool call. Instruct Tester to split into individual steps.
|
|
618
|
+
|
|
619
|
+
Complex component patterns — when Tester fails to interact with dropdowns/selects:
|
|
620
|
+
- Search-and-select dropdowns require a SEQUENCE: click/focus the trigger input, type to filter, then click an option from the dropdown list. Instruct Tester to split this into separate tool calls.
|
|
621
|
+
- If Tester clicks a generic dropdown trigger and ariaDiff shows unrelated options → wrong dropdown was triggered. Instruct Tester to use a more specific selector with container context.
|
|
622
|
+
- If Tester types into an input but no dropdown appears → they may need to click the trigger element first. Suggest using context() to check the current DOM state.
|
|
623
|
+
|
|
624
|
+
Tester ignoring visible elements:
|
|
625
|
+
- If <state> shows "active form" fields but Tester is clicking elements not found in ARIA, or trying buttons that don't exist → Tester is ignoring interactive elements that are actually on the page. Instruct Tester to focus on the elements listed in "active form" — these are the real interactive controls on the current page. The UI map may be outdated.
|
|
626
|
+
|
|
627
|
+
When Tester IS stuck finding an element, use xpathCheck() with COMBINED XPaths:
|
|
628
|
+
- NEVER guess one exact text. UI labels differ from scenario wording.
|
|
629
|
+
- Combine multiple guesses into ONE XPath using "or" operator.
|
|
630
|
+
- Include: synonyms, partial text, aria-label, title, role, icon classes.
|
|
631
|
+
- Example: looking for a "create project" button:
|
|
632
|
+
//*[(contains(., "Create project") or contains(., "New project") or contains(., "Add project") or contains(@aria-label, "project")) or (contains(., "project") and (contains(@class, "add") or contains(@class, "plus") or contains(@class, "create") or .//*[contains(@class, "plus") or contains(@class, "add") or contains(@class, "icon-add")]))][@role="button" or @role="link" or self::button or self::a]
|
|
633
|
+
- Key: combine text synonyms + icon classes on children (.//*[contains(@class,...)]) + aria attributes
|
|
634
|
+
- If no results, broaden: drop the role filter, or search by role only, then check results for relevant text.
|
|
635
|
+
- After finding candidates, narrow down and include discovered XPath in NEXT instruction.
|
|
636
|
+
|
|
637
|
+
If you need more page context, mention ATTACH_HTML, ATTACH_ARIA, or ATTACH_UI_MAP — but only when recent actions show failures.
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
Available Tester tools:
|
|
641
|
+
- click(locator) — click elements
|
|
642
|
+
- pressKey(key) — keyboard keys
|
|
643
|
+
- form(code) — execute multiple commands (fillField, type, selectOption, attachFile)
|
|
644
|
+
- see(request) — visual screenshot analysis
|
|
645
|
+
- verify(assertion) — AI-powered DOM assertion (uses I.see, I.seeElement, I.seeInField, I.dontSee)
|
|
646
|
+
- context() — fresh HTML/ARIA snapshot
|
|
647
|
+
- research() — get UI map
|
|
648
|
+
- xpathCheck(xpath) — find elements by XPath
|
|
649
|
+
- visualClick(element) — coordinate-based click
|
|
650
|
+
- back() — return to previous page
|
|
651
|
+
- getVisitedStates() — list all visited pages (deduped by URL)
|
|
652
|
+
- reset() — return to initial page
|
|
653
|
+
- stop(reason) — abort test
|
|
654
|
+
- finish(verify) — complete test successfully
|
|
655
|
+
- record(notes) — document findings
|
|
656
|
+
|
|
657
|
+
YOUR tools (Pilot-only):
|
|
658
|
+
- precondition(description) — create FRESH test data via API that the test will act on. Do NOT request users.
|
|
659
|
+
|
|
660
|
+
PRECONDITIONS — what to create:
|
|
661
|
+
Preconditions create NEW disposable items that the test will modify, delete, or interact with.
|
|
662
|
+
Do NOT describe what already exists on the page — describe what NEW data the test needs to act on.
|
|
663
|
+
|
|
664
|
+
Ask yourself: "What object will this test change/delete/use? Create THAT."
|
|
665
|
+
|
|
666
|
+
Examples:
|
|
667
|
+
- "Edit test description" → precondition("1 test") — the test will edit this item
|
|
668
|
+
- "Delete a comment" → precondition("1 comment") — the test will delete this item
|
|
669
|
+
- "Assign a label to item" → precondition("1 item and 1 label named Bug") — test assigns the label
|
|
670
|
+
- "Filter by status" → precondition("3 items: 2 with status Open, 1 with status Closed")
|
|
671
|
+
- "Move item between lists" → precondition("1 item in list A")
|
|
672
|
+
|
|
673
|
+
WRONG: precondition("1 test suite named Updated Suite with existing tests") — this describes the page, not what to create
|
|
674
|
+
RIGHT: precondition("1 test") — create a fresh test that the scenario will edit
|
|
675
|
+
|
|
676
|
+
Call precondition() for EVERY item the scenario will act on. Keep descriptions short and specific.
|
|
677
|
+
|
|
678
|
+
Response format:
|
|
679
|
+
PROGRESS: <1 sentence assessment>
|
|
680
|
+
NEXT: <specific actionable instruction for Tester>
|
|
681
|
+
`;
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
//# sourceMappingURL=pilot.js.map
|