explorbot 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +3 -2
- package/dist/src/action.js +3 -2
- package/dist/src/ai/conversation.js +20 -4
- package/dist/src/ai/historian/utils.js +8 -1
- package/dist/src/ai/pilot.js +198 -260
- package/dist/src/ai/provider.js +25 -12
- package/dist/src/ai/quartermaster.js +2 -2
- package/dist/src/ai/rules.js +2 -0
- package/dist/src/ai/session-analyst.js +46 -41
- package/dist/src/ai/tester.js +56 -20
- package/dist/src/ai/tools.js +19 -4
- package/dist/src/commands/explore-command.js +8 -2
- package/dist/src/components/StatusPane.js +6 -1
- package/dist/src/experience-tracker.js +9 -0
- package/dist/src/explorer.js +2 -5
- package/dist/src/reporter.js +41 -1
- package/dist/src/stats.js +2 -1
- package/dist/src/test-plan.js +47 -3
- package/package.json +3 -2
- package/src/action.ts +3 -2
- package/src/ai/conversation.ts +21 -4
- package/src/ai/historian/utils.ts +8 -1
- package/src/ai/pilot.ts +199 -259
- package/src/ai/provider.ts +24 -12
- package/src/ai/quartermaster.ts +2 -2
- package/src/ai/rules.ts +2 -0
- package/src/ai/session-analyst.ts +47 -41
- package/src/ai/tester.ts +48 -18
- package/src/ai/tools.ts +18 -4
- package/src/commands/explore-command.ts +9 -2
- package/src/components/StatusPane.tsx +6 -3
- package/src/experience-tracker.ts +9 -0
- package/src/explorer.ts +1 -4
- package/src/reporter.ts +44 -1
- package/src/stats.ts +3 -1
- package/src/test-plan.ts +62 -3
package/src/ai/provider.ts
CHANGED
|
@@ -19,6 +19,15 @@ const responseLog = createDebug('explorbot:provider:in');
|
|
|
19
19
|
class AiError extends Error {}
|
|
20
20
|
export class ContextLengthError extends Error {}
|
|
21
21
|
|
|
22
|
+
/**
 * Reads the number of cached prompt tokens from a model response's usage object.
 *
 * The usage shape is not fixed, so the following fields are probed in order and
 * the first one holding a finite number wins:
 *   1. `usage.cachedInputTokens`
 *   2. `usage.inputTokenDetails.cacheReadTokens`
 *   3. `usage.raw.prompt_tokens_details.cached_tokens`
 *   4. `usage.raw.promptTokensDetails.cachedTokens`
 *
 * @param usage - Usage object from a model response; may be null/undefined.
 * @returns The cached-token count, or 0 when no probed field holds a finite number.
 */
function extractCachedTokens(usage: any): number {
  if (!usage) return 0;

  const raw = usage.raw;
  const candidates: unknown[] = [
    usage.cachedInputTokens,
    usage.inputTokenDetails?.cacheReadTokens,
    raw?.prompt_tokens_details?.cached_tokens,
    raw?.promptTokensDetails?.cachedTokens,
  ];

  for (const candidate of candidates) {
    // `typeof NaN === 'number'`, so a plain typeof check would let NaN/Infinity
    // through to the caller; require a finite value before accepting a field.
    if (typeof candidate === 'number' && Number.isFinite(candidate)) return candidate;
  }
  return 0;
}
|
|
30
|
+
|
|
22
31
|
function rejectAfterIdle(ms: number, signal: { cancelled: boolean }): Promise<never> {
|
|
23
32
|
return new Promise((_, reject) => {
|
|
24
33
|
const tick = () => {
|
|
@@ -265,9 +274,10 @@ export class Provider {
|
|
|
265
274
|
|
|
266
275
|
if (response.usage) {
|
|
267
276
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
268
|
-
input: response.usage.promptTokens
|
|
269
|
-
output: response.usage.completionTokens
|
|
270
|
-
total: response.usage.totalTokens
|
|
277
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
278
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
279
|
+
total: response.usage.totalTokens ?? 0,
|
|
280
|
+
cached: extractCachedTokens(response.usage),
|
|
271
281
|
});
|
|
272
282
|
}
|
|
273
283
|
|
|
@@ -355,9 +365,10 @@ export class Provider {
|
|
|
355
365
|
|
|
356
366
|
if (response.usage) {
|
|
357
367
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
358
|
-
input: response.usage.promptTokens
|
|
359
|
-
output: response.usage.completionTokens
|
|
360
|
-
total: response.usage.totalTokens
|
|
368
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
369
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
370
|
+
total: response.usage.totalTokens ?? 0,
|
|
371
|
+
cached: extractCachedTokens(response.usage),
|
|
361
372
|
});
|
|
362
373
|
}
|
|
363
374
|
|
|
@@ -428,9 +439,10 @@ export class Provider {
|
|
|
428
439
|
|
|
429
440
|
if (response.usage) {
|
|
430
441
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
431
|
-
input: response.usage.promptTokens
|
|
432
|
-
output: response.usage.completionTokens
|
|
433
|
-
total: response.usage.totalTokens
|
|
442
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
443
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
444
|
+
total: response.usage.totalTokens ?? 0,
|
|
445
|
+
cached: extractCachedTokens(response.usage),
|
|
434
446
|
});
|
|
435
447
|
}
|
|
436
448
|
|
|
@@ -625,9 +637,9 @@ export class Provider {
|
|
|
625
637
|
|
|
626
638
|
if (response.usage) {
|
|
627
639
|
Stats.recordTokens('vision', this.getModelName(this.config.visionModel), {
|
|
628
|
-
input: response.usage.promptTokens
|
|
629
|
-
output: response.usage.completionTokens
|
|
630
|
-
total: response.usage.totalTokens
|
|
640
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
641
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
642
|
+
total: response.usage.totalTokens ?? 0,
|
|
631
643
|
});
|
|
632
644
|
}
|
|
633
645
|
|
package/src/ai/quartermaster.ts
CHANGED
|
@@ -240,11 +240,11 @@ Focus on what would confuse a real user or caused the agent to make mistakes.`;
|
|
|
240
240
|
const criticalViolations = report.axeViolations.filter((v) => v.impact === 'critical' || v.impact === 'serious');
|
|
241
241
|
for (const v of criticalViolations.slice(0, 3)) {
|
|
242
242
|
const nodeHtml = v.nodes[0]?.html.slice(0, 100) || '';
|
|
243
|
-
task.
|
|
243
|
+
task.addVerificationDetail(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
|
|
244
244
|
}
|
|
245
245
|
|
|
246
246
|
for (const issue of report.semanticIssues.slice(0, 3)) {
|
|
247
|
-
task.
|
|
247
|
+
task.addVerificationDetail(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
|
|
248
248
|
}
|
|
249
249
|
}
|
|
250
250
|
|
package/src/ai/rules.ts
CHANGED
|
@@ -241,6 +241,8 @@ export function multipleTabsRule(tabs: Array<{ url: string; title: string }>): s
|
|
|
241
241
|
|
|
242
242
|
export const actionRule = dedent`
|
|
243
243
|
<actions>
|
|
244
|
+
\`faker\` (from @faker-js/faker) is available inside I.* calls for generating data, e.g. I.fillField('Bio', faker.lorem.paragraphs(5)).
|
|
245
|
+
|
|
244
246
|
### I.click
|
|
245
247
|
|
|
246
248
|
clicks on the element by its locator
|
|
@@ -19,69 +19,71 @@ export class SessionAnalyst implements Agent {
|
|
|
19
19
|
const eligible = tests.filter((t) => t.startTime != null);
|
|
20
20
|
if (eligible.length === 0) return '';
|
|
21
21
|
|
|
22
|
-
const model = this.provider.
|
|
22
|
+
const model = this.provider.getAgenticModel('analyst');
|
|
23
23
|
const customPrompt = this.provider.getSystemPromptForAgent('analyst', undefined);
|
|
24
24
|
|
|
25
25
|
const systemPrompt = dedent`
|
|
26
|
-
You write a
|
|
26
|
+
You write a TERSE end-of-session report. Reader is a developer who wants to UNDERSTAND THE FEATURE — what works, what is broken, what is unclear. Every word must earn its place.
|
|
27
27
|
|
|
28
|
-
Output MARKDOWN. No JSON, no preamble, no closing
|
|
28
|
+
Output MARKDOWN. No JSON, no preamble, no closing summary.
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
Group by ROOT CAUSE, not by scenario. If three tests fail for the same dropdown, that is ONE defect listing all three test refs (#3, #5, #7). Do not produce one cluster per test.
|
|
30
|
+
NO EMOJI. No 🔴 🟡 🟢 ✅, no escape sequences like \\u2705. Use plain text severity tags: [High], [Medium], [Low] for defects.
|
|
32
31
|
|
|
33
|
-
##
|
|
34
|
-
Use the FINAL verdict (the test's \`result\` field) as the starting point. Mid-test errors that the automation recovered from do NOT make a passed test unreliable.
|
|
32
|
+
## Reporting unit
|
|
35
33
|
|
|
36
|
-
|
|
37
|
-
- **UX issue** — app works but the UI is ambiguous, controls are hidden, or labels are unclear. Worth flagging to design.
|
|
38
|
-
- **Execution issue** — the FINAL verdict is unreliable. Only two cases:
|
|
39
|
-
1. \`result: failed\` AND the failure was automation, environment, or UI/UX (locator missing, timeout, AI loop, navigation stuck, modal trapped focus, no accessible label) — i.e. the test could not conclude whether the app works.
|
|
40
|
-
2. \`result: passed\` AND clear evidence in the log shows the user-visible goal was NOT achieved (no confirmation visible, no state change verified, the assertion was vacuous).
|
|
34
|
+
Report at the level of FEATURES / FLOWS / PAGES. Tests are evidence, not the unit. Several tests covering the same flow → ONE entry citing all of them.
|
|
41
35
|
|
|
42
|
-
|
|
36
|
+
## Walk every test
|
|
43
37
|
|
|
44
|
-
|
|
45
|
-
- 🔴 critical or high — core flow blocked, data loss, security
|
|
46
|
-
- 🟡 medium — partial breakage with workaround
|
|
47
|
-
- 🟢 low — cosmetic
|
|
38
|
+
PASSED test: did all steps run, was the goal actually verified, did the user-visible goal happen? All yes → contributes to What works. Any no → Execution issue (false positive).
|
|
48
39
|
|
|
49
|
-
|
|
40
|
+
FAILED test, first match wins: (1) goal achieved but mis-verified → Execution. (2) automation failure (locator/timeout/loop/modal/a11y) → Execution. (3) bad preconditions or data → Execution. (4) wrong URL/environment → Execution. (5) app contradicted expected outcome → Defect.
|
|
41
|
+
|
|
42
|
+
Crucial distinction: "the app misbehaved" vs "the automation could not interact with the app". ONLY the first is a Defect. If the automation gives up before the app responds — timeout, retries exhausted, dead loop / loop detected, could not click or find an element — that is an Execution issue regardless of what the log calls it. Failure inside the automation ≠ failure inside the product.
|
|
43
|
+
|
|
44
|
+
A solitary failure where adjacent tests on the same feature passed → Execution, not Defect.
|
|
45
|
+
|
|
46
|
+
## Severity (defects only)
|
|
47
|
+
[High] blocks a core flow · [Medium] degrades a flow but workaround exists · [Low] cosmetic / edge case
|
|
48
|
+
|
|
49
|
+
## Format
|
|
50
50
|
|
|
51
51
|
# Session Analysis
|
|
52
52
|
|
|
53
|
-
<
|
|
53
|
+
<ONE or TWO sentences describing the FEATURE STATE — what was explored, whether the core flow holds, what the standout problem is. NO test counts, NO "N tests run". Talk about the product, not the run.>
|
|
54
|
+
|
|
55
|
+
## Coverage
|
|
56
|
+
- Pages: <paths>
|
|
57
|
+
- Features: <capabilities>
|
|
58
|
+
|
|
59
|
+
## What works
|
|
60
|
+
- **<feature>** — #2, #7, #8
|
|
54
61
|
|
|
55
62
|
## Defects
|
|
56
63
|
|
|
57
|
-
###
|
|
58
|
-
Affects: #3, #5
|
|
64
|
+
### [Medium] <plain-English bug title>
|
|
65
|
+
Affects: #3, #5
|
|
59
66
|
Reproduce:
|
|
60
|
-
1. <concrete UI step
|
|
61
|
-
2. <next
|
|
62
|
-
Evidence: <one short observation
|
|
63
|
-
|
|
64
|
-
### 🟡 <next defect>
|
|
65
|
-
...
|
|
67
|
+
1. <concrete UI step>
|
|
68
|
+
2. <next>
|
|
69
|
+
Evidence: <one short observation>
|
|
66
70
|
|
|
67
71
|
## UX issues
|
|
68
|
-
|
|
69
|
-
- **<title>** — #4
|
|
70
|
-
<one short evidence line>
|
|
72
|
+
- **<feature>** — <what's confusing> (#7)
|
|
71
73
|
|
|
72
74
|
## Execution Issues
|
|
75
|
+
- **#2 <scenario>** — <≤10 words, what was unreliable>
|
|
73
76
|
|
|
74
|
-
|
|
75
|
-
- **<…>** — <…>
|
|
77
|
+
## Brevity rules
|
|
76
78
|
|
|
77
|
-
|
|
78
|
-
-
|
|
79
|
-
- Defect title
|
|
80
|
-
- Reproduce steps are
|
|
81
|
-
- Evidence is
|
|
82
|
-
-
|
|
83
|
-
-
|
|
84
|
-
-
|
|
79
|
+
- Headline: 2 sentences MAX. About the FEATURE, not the run. No counts, no "N tests", no "this session". Banned words: "exercised", "comprehensive", "notably", "this session", "module", "targeted", "covered creation".
|
|
80
|
+
- What works: feature name + test refs. NO parentheticals, NO caveats. If there's a caveat, the entry doesn't belong here.
|
|
81
|
+
- Defect title is the BUG ("Search returns non-matching results"), never the scenario name.
|
|
82
|
+
- Reproduce steps are imperative one-liners drawn from the log.
|
|
83
|
+
- Evidence is one short factual observation. Never quote the \`result\` field.
|
|
84
|
+
- Execution Issues: ONE line per test, ≤10 words, plain. Examples: "passed vacuously, no list assertion", "no file upload step in log", "dead loop on Save click". No prefixes, no nested explanation.
|
|
85
|
+
- Omit any empty section.
|
|
86
|
+
- Section order: Coverage → What works → Defects (severity desc) → UX issues → Execution Issues.
|
|
85
87
|
|
|
86
88
|
${customPrompt || ''}
|
|
87
89
|
`;
|
|
@@ -101,7 +103,7 @@ export class SessionAnalyst implements Agent {
|
|
|
101
103
|
{ agentName: 'analyst' }
|
|
102
104
|
);
|
|
103
105
|
|
|
104
|
-
return (response?.text || '').trim();
|
|
106
|
+
return decodeEscapes((response?.text || '').trim());
|
|
105
107
|
}
|
|
106
108
|
|
|
107
109
|
writeReport(markdown: string): string {
|
|
@@ -131,3 +133,7 @@ export class SessionAnalyst implements Agent {
|
|
|
131
133
|
`;
|
|
132
134
|
}
|
|
133
135
|
}
|
|
136
|
+
|
|
137
|
+
/**
 * Replaces literal Unicode escape sequences in `text` with the characters they denote.
 *
 * Two spellings are recognised: `\u{H...}` (one or more hex digits in braces)
 * and `\uHHHH` (exactly four hex digits). Consecutive `\uHHHH` escapes that form
 * a surrogate pair end up adjacent in the output and therefore combine into one
 * character.
 *
 * The text is scanned once, so characters produced by decoding are never
 * re-interpreted as the start of another escape. An escape whose value is not a
 * valid code point (above U+10FFFF) is left untouched instead of throwing.
 *
 * @param text - Text that may contain literal escape sequences.
 * @returns The text with every valid escape sequence decoded.
 */
function decodeEscapes(text: string): string {
  return text.replace(/\\u\{([0-9a-fA-F]+)\}|\\u([0-9a-fA-F]{4})/g, (match: string, braced?: string, fixed?: string) => {
    const codePoint = Number.parseInt(braced ?? fixed ?? '', 16);
    // String.fromCodePoint throws RangeError outside 0..0x10FFFF; keep the
    // original characters so one malformed escape cannot abort the whole call.
    if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
    return String.fromCodePoint(codePoint);
  });
}
|
package/src/ai/tester.ts
CHANGED
|
@@ -64,6 +64,8 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
64
64
|
private pageStateHash: string | null = null;
|
|
65
65
|
private pageActionResult: ActionResult | null = null;
|
|
66
66
|
private hooksRunner: HooksRunner;
|
|
67
|
+
private seenUiMapUrls = new Set<string>();
|
|
68
|
+
private lastAnalyzedStateHash: string | null = null;
|
|
67
69
|
|
|
68
70
|
constructor(explorer: Explorer, provider: Provider, researcher: Researcher, navigator: Navigator, agentTools?: any) {
|
|
69
71
|
super();
|
|
@@ -104,7 +106,7 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
104
106
|
}
|
|
105
107
|
|
|
106
108
|
private get progressCheckInterval(): number {
|
|
107
|
-
return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ??
|
|
109
|
+
return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ?? 3;
|
|
108
110
|
}
|
|
109
111
|
|
|
110
112
|
getConversation(): Conversation | null {
|
|
@@ -123,6 +125,8 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
123
125
|
this.previousStateHash = null;
|
|
124
126
|
this.pageStateHash = null;
|
|
125
127
|
this.pageActionResult = null;
|
|
128
|
+
this.seenUiMapUrls.clear();
|
|
129
|
+
this.lastAnalyzedStateHash = null;
|
|
126
130
|
this.explorer.getStateManager().clearHistory();
|
|
127
131
|
this.resetFailureCount();
|
|
128
132
|
this.pilot?.reset();
|
|
@@ -147,14 +151,20 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
147
151
|
const initialState = ActionResult.fromState(state);
|
|
148
152
|
|
|
149
153
|
const conversation = this.provider.startConversation(this.getSystemMessage(), 'tester');
|
|
154
|
+
conversation.markLastMessageCacheable();
|
|
150
155
|
this.currentConversation = conversation;
|
|
151
156
|
|
|
152
157
|
const outputDir = ConfigParser.getInstance().getOutputDir();
|
|
153
158
|
this.executionLogFile = join(outputDir, `tester_${task.sessionName}.md`);
|
|
154
159
|
// Note: Markdown saving functionality removed from Conversation class
|
|
155
160
|
|
|
156
|
-
const
|
|
157
|
-
conversation.addUserText(
|
|
161
|
+
const scenarioBlock = this.buildScenarioBlock(task, initialState);
|
|
162
|
+
conversation.addUserText(scenarioBlock);
|
|
163
|
+
conversation.markLastMessageCacheable();
|
|
164
|
+
conversation.protectPrefix(conversation.messages.length);
|
|
165
|
+
|
|
166
|
+
const pageContext = await this.reinjectContextIfNeeded(1, initialState);
|
|
167
|
+
if (pageContext) conversation.addUserText(pageContext);
|
|
158
168
|
|
|
159
169
|
return await Observability.run(
|
|
160
170
|
`test: ${task.scenario}`,
|
|
@@ -177,6 +187,12 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
177
187
|
if (this.pilot) {
|
|
178
188
|
try {
|
|
179
189
|
const plan = await this.pilot.planTest(task, initialState);
|
|
190
|
+
if (task.hasFinished) {
|
|
191
|
+
offFailedRequest?.();
|
|
192
|
+
page?.off('pageerror', onPageError);
|
|
193
|
+
page?.off('console', onConsoleMessage);
|
|
194
|
+
return { success: task.isSuccessful };
|
|
195
|
+
}
|
|
180
196
|
if (plan) {
|
|
181
197
|
conversation.addUserText(`Pilot's test plan:\n${plan}\n\nFollow this plan while executing the test.`);
|
|
182
198
|
}
|
|
@@ -200,13 +216,15 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
200
216
|
debugLog(`Navigating to ${task.startUrl}`);
|
|
201
217
|
await this.explorer.visit(task.startUrl!);
|
|
202
218
|
|
|
203
|
-
const
|
|
219
|
+
const startState = this.explorer.getStateManager().getCurrentState();
|
|
220
|
+
if (startState) task.addUrlNote(startState);
|
|
221
|
+
const currentUrl = startState?.url || task.startUrl || '';
|
|
204
222
|
await this.hooksRunner.runBeforeHook('tester', currentUrl);
|
|
205
223
|
|
|
206
224
|
const offStateChange = this.explorer.getStateManager().onStateChange((event: StateTransition) => {
|
|
207
225
|
if (task.hasFinished) return;
|
|
208
226
|
if (event.toState?.url === event.fromState?.url) return;
|
|
209
|
-
task.
|
|
227
|
+
if (event.toState) task.addUrlNote(event.toState, event.fromState || undefined);
|
|
210
228
|
task.states.push(event.toState);
|
|
211
229
|
});
|
|
212
230
|
|
|
@@ -253,13 +271,13 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
253
271
|
`);
|
|
254
272
|
}
|
|
255
273
|
|
|
256
|
-
conversation.cleanupTag('page_aria', '...cleaned aria snapshot...',
|
|
274
|
+
conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 1);
|
|
257
275
|
conversation.cleanupTag('page_html', '...cleaned HTML snapshot...', 1);
|
|
258
276
|
conversation.cleanupTag('experience', '...cleaned experience...', 1);
|
|
259
277
|
conversation.cleanupTag('applied_experience', '...cleaned past experience...', 1);
|
|
260
278
|
conversation.cleanupTag('page_ui_map', '...cleaned UI map...', 1);
|
|
261
279
|
conversation.cleanupTag('page_ui_map_overlay', '...cleaned UI overlay...', 1);
|
|
262
|
-
conversation.compactToolResults(
|
|
280
|
+
conversation.compactToolResults(2);
|
|
263
281
|
|
|
264
282
|
if (iteration > 1) {
|
|
265
283
|
const isNewPage = this.previousUrl !== null && this.previousUrl !== currentState.url;
|
|
@@ -270,16 +288,17 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
270
288
|
if (isNewPage && this.pilot) {
|
|
271
289
|
const guidance = await this.pilot.reviewNewPage(task, currentState, conversation);
|
|
272
290
|
if (guidance) nextStep += `\n\n${guidance}`;
|
|
273
|
-
} else if ((iteration
|
|
291
|
+
} else if (this.shouldAnalyzeProgress(iteration, currentState) && this.pilot) {
|
|
274
292
|
const guidance = await this.pilot.analyzeProgress(task, currentState, conversation);
|
|
275
293
|
if (guidance) nextStep += `\n\n${guidance}`;
|
|
276
294
|
this.consecutiveFailures = 0;
|
|
295
|
+
this.lastAnalyzedStateHash = currentState.hash;
|
|
277
296
|
}
|
|
278
297
|
conversation.addUserText(nextStep);
|
|
279
298
|
}
|
|
280
299
|
|
|
281
300
|
const result = await this.provider.invokeConversation(conversation, tools, {
|
|
282
|
-
maxToolRoundtrips:
|
|
301
|
+
maxToolRoundtrips: 3,
|
|
283
302
|
toolChoice: 'required',
|
|
284
303
|
stopWhen: () => task.hasFinished,
|
|
285
304
|
});
|
|
@@ -421,6 +440,14 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
421
440
|
};
|
|
422
441
|
}
|
|
423
442
|
|
|
443
|
+
/**
 * Decides whether a progress analysis should run on this iteration.
 *
 * Returns true immediately when at least 3 consecutive failures or at least 2
 * consecutive empty results have been counted. Otherwise returns true only when
 * the iteration is a multiple of `progressCheckInterval` and the current state
 * hash differs from the one stored in `lastAnalyzedStateHash`.
 */
private shouldAnalyzeProgress(iteration: number, currentState: ActionResult): boolean {
  const looksStuck = this.consecutiveFailures >= 3 || this.consecutiveEmptyResults >= 2;
  if (looksStuck) return true;

  const onCheckpoint = iteration % this.progressCheckInterval === 0;
  // Skip the periodic check when this exact state was already analyzed.
  return onCheckpoint && this.lastAnalyzedStateHash !== currentState.hash;
}
|
|
450
|
+
|
|
424
451
|
private async prepareInstructionsForNextStep(task: Test): Promise<string> {
|
|
425
452
|
let outcomeStatus = dedent`
|
|
426
453
|
<task>
|
|
@@ -511,17 +538,21 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
511
538
|
}
|
|
512
539
|
|
|
513
540
|
if (isNewUrl) {
|
|
541
|
+
const alreadySeenUiMap = this.seenUiMapUrls.has(currentUrl);
|
|
514
542
|
let research = '';
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
543
|
+
if (!alreadySeenUiMap) {
|
|
544
|
+
try {
|
|
545
|
+
research = await this.researcher.research(currentState);
|
|
546
|
+
} catch (err) {
|
|
547
|
+
if (!(err instanceof ErrorPageError)) throw err;
|
|
548
|
+
tag('warning').log(`Research skipped: ${err.message}`);
|
|
549
|
+
}
|
|
520
550
|
}
|
|
521
551
|
this.pageStateHash = currentStateHash;
|
|
522
552
|
this.pageActionResult = currentState;
|
|
523
553
|
let uiMapSection = '';
|
|
524
554
|
if (research) {
|
|
555
|
+
this.seenUiMapUrls.add(currentUrl);
|
|
525
556
|
uiMapSection = dedent`
|
|
526
557
|
|
|
527
558
|
Page UI Map
|
|
@@ -530,6 +561,8 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
530
561
|
${research}
|
|
531
562
|
</page_ui_map>
|
|
532
563
|
`;
|
|
564
|
+
} else if (alreadySeenUiMap) {
|
|
565
|
+
uiMapSection = `\n\n<page_ui_map>UI map for ${currentUrl} was shown earlier in this session — refer to it above.</page_ui_map>`;
|
|
533
566
|
}
|
|
534
567
|
|
|
535
568
|
context += dedent`
|
|
@@ -740,9 +773,8 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
740
773
|
`;
|
|
741
774
|
}
|
|
742
775
|
|
|
743
|
-
private
|
|
776
|
+
private buildScenarioBlock(task: Test, actionResult: ActionResult): string {
|
|
744
777
|
const knowledge = this.getKnowledge(actionResult);
|
|
745
|
-
const pageContext = await this.reinjectContextIfNeeded(1, actionResult);
|
|
746
778
|
|
|
747
779
|
return dedent`
|
|
748
780
|
<task>
|
|
@@ -770,8 +802,6 @@ export class Tester extends TaskAgent implements Agent {
|
|
|
770
802
|
${this.buildAvailableFiles()}
|
|
771
803
|
|
|
772
804
|
${knowledge}
|
|
773
|
-
|
|
774
|
-
${pageContext}
|
|
775
805
|
`;
|
|
776
806
|
}
|
|
777
807
|
|
package/src/ai/tools.ts
CHANGED
|
@@ -510,7 +510,7 @@ export function createAgentTools({
|
|
|
510
510
|
}
|
|
511
511
|
|
|
512
512
|
return successToolResult('see', {
|
|
513
|
-
analysis: analysisResult,
|
|
513
|
+
analysis: cap(analysisResult, ANALYSIS_OUTPUT_CAP),
|
|
514
514
|
message: `Successfully analyzed screenshot for: ${request}`,
|
|
515
515
|
suggestion: 'Visual confirmation is valid evidence for test results. Use record() to note the visual findings.',
|
|
516
516
|
});
|
|
@@ -559,8 +559,8 @@ export function createAgentTools({
|
|
|
559
559
|
url: currentState.url,
|
|
560
560
|
title: currentState.title,
|
|
561
561
|
suggestion: 'If not enough context received, call see() to visually identify elements in page contents',
|
|
562
|
-
aria,
|
|
563
|
-
html,
|
|
562
|
+
aria: cap(aria, ARIA_OUTPUT_CAP),
|
|
563
|
+
html: cap(html, HTML_OUTPUT_CAP),
|
|
564
564
|
reminder: 'Context provided. Do not call context() again until you perform actions or suspect page changed.',
|
|
565
565
|
});
|
|
566
566
|
} catch (error) {
|
|
@@ -657,7 +657,7 @@ export function createAgentTools({
|
|
|
657
657
|
|
|
658
658
|
return successToolResult('research', {
|
|
659
659
|
analysis: researchResult,
|
|
660
|
-
aria: ActionResult.fromState(currentState).getInteractiveARIA(),
|
|
660
|
+
aria: cap(ActionResult.fromState(currentState).getInteractiveARIA(), ARIA_OUTPUT_CAP),
|
|
661
661
|
message: `Successfully researched page: ${currentState.url}.`,
|
|
662
662
|
suggestion: dedent`
|
|
663
663
|
You received comprehensive UI map report. Use it to understand the page structure and navigate to the elements.
|
|
@@ -1001,6 +1001,16 @@ export function createAgentTools({
|
|
|
1001
1001
|
|
|
1002
1002
|
const PAGE_DIFF_SUGGESTION = 'Analyze page diff. htmlParts shows what changed and WHERE — each part has a container selector. Use the container as context when clicking elements from the diff.';
|
|
1003
1003
|
|
|
1004
|
+
const ARIA_OUTPUT_CAP = 4000;
|
|
1005
|
+
const HTML_OUTPUT_CAP = 6000;
|
|
1006
|
+
const ANALYSIS_OUTPUT_CAP = 2000;
|
|
1007
|
+
|
|
1008
|
+
/**
 * Limits `text` to at most `max` UTF-16 code units, appending a marker that
 * states how many code units were dropped.
 *
 * @param text - Text to limit; null, undefined and '' all yield ''.
 * @param max - Maximum number of code units to keep from the start of `text`.
 * @returns `text` unchanged when it fits, otherwise the kept prefix followed by
 *   a newline and a `[...truncated; N chars omitted...]` marker.
 */
function cap(text: string | undefined | null, max: number): string {
  if (!text) return '';
  if (text.length <= max) return text;

  let end = max;
  const before = text.charCodeAt(end - 1);
  const after = text.charCodeAt(end);
  // If the cut would fall between a high and a low surrogate, keep one code
  // unit less so the prefix does not end in a lone surrogate.
  const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) end -= 1;

  return `${text.slice(0, end)}\n[...truncated; ${text.length - end} chars omitted...]`;
}
|
|
1013
|
+
|
|
1004
1014
|
function transformContainsCommand(command: string): string {
|
|
1005
1015
|
if (!command.includes(':contains(')) return command;
|
|
1006
1016
|
|
|
@@ -1044,8 +1054,12 @@ function successToolResult(action: string, data?: Record<string, any>, source?:
|
|
|
1044
1054
|
if (data?.pageDiff) {
|
|
1045
1055
|
let suggestion = PAGE_DIFF_SUGGESTION;
|
|
1046
1056
|
const ariaChanges = data.pageDiff.ariaChanges || '';
|
|
1057
|
+
const urlChanged = data.pageDiff.urlChanged === true;
|
|
1058
|
+
const hasHtmlParts = Array.isArray(data.pageDiff.htmlParts) && data.pageDiff.htmlParts.length > 0;
|
|
1047
1059
|
if (countAriaChanges(ariaChanges) >= 50) {
|
|
1048
1060
|
suggestion = `MAJOR PAGE CHANGE. Page entered a different mode. Check htmlParts and iframes in pageDiff before next action. ${suggestion}`;
|
|
1061
|
+
} else if (!urlChanged && !ariaChanges && !hasHtmlParts) {
|
|
1062
|
+
suggestion = 'Action ran without error but produced no observable change (URL, ARIA and HTML all unchanged). The locator likely matched a non-interactive ancestor or an element outside the intended control. Re-locate via xpathCheck() or verify with see() before treating this as success.';
|
|
1049
1063
|
} else if (ariaChanges.includes('heading') && ariaChanges.includes('added')) {
|
|
1050
1064
|
suggestion += ' WARNING: A new panel or modal may have appeared. If this was not the intended action, close it and try a different element.';
|
|
1051
1065
|
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import figureSet from 'figures';
|
|
2
2
|
import { getStyles } from '../ai/planner/styles.js';
|
|
3
3
|
import { outputPath } from '../config.js';
|
|
4
|
+
import { normalizeUrl } from '../state-manager.js';
|
|
4
5
|
import { Stats } from '../stats.js';
|
|
5
6
|
import type { Plan } from '../test-plan.js';
|
|
6
7
|
import { getCliName } from '../utils/cli-name.ts';
|
|
@@ -11,6 +12,8 @@ import { type NextStepSection, printNextSteps, relativeToCwd } from '../utils/ne
|
|
|
11
12
|
import { safeFilename } from '../utils/strings.ts';
|
|
12
13
|
import { BaseCommand, type Suggestion } from './base-command.js';
|
|
13
14
|
|
|
15
|
+
const MAX_SUB_PAGE_ATTEMPTS = 30;
|
|
16
|
+
|
|
14
17
|
export class ExploreCommand extends BaseCommand {
|
|
15
18
|
name = 'explore';
|
|
16
19
|
description = 'Start web exploration';
|
|
@@ -27,6 +30,7 @@ export class ExploreCommand extends BaseCommand {
|
|
|
27
30
|
maxTests?: number;
|
|
28
31
|
private testsRun = 0;
|
|
29
32
|
private completedPlans: Plan[] = [];
|
|
33
|
+
private failedSubPages = new Set<string>();
|
|
30
34
|
|
|
31
35
|
async execute(args: string): Promise<void> {
|
|
32
36
|
const { opts, args: remaining } = this.parseArgs(args);
|
|
@@ -46,10 +50,12 @@ export class ExploreCommand extends BaseCommand {
|
|
|
46
50
|
|
|
47
51
|
if (!feature && !this.isLimitReached()) {
|
|
48
52
|
const planner = this.explorBot.agentPlanner();
|
|
49
|
-
|
|
53
|
+
let attempts = 0;
|
|
54
|
+
while (attempts < MAX_SUB_PAGE_ATTEMPTS) {
|
|
55
|
+
attempts++;
|
|
50
56
|
if (this.isLimitReached()) break;
|
|
51
57
|
|
|
52
|
-
const candidates = planner.collectSubPageCandidates(mainPlan, mainUrl || '/');
|
|
58
|
+
const candidates = planner.collectSubPageCandidates(mainPlan, mainUrl || '/').filter((c) => !this.failedSubPages.has(normalizeUrl(c.url)));
|
|
53
59
|
if (candidates.length === 0) break;
|
|
54
60
|
|
|
55
61
|
const pick = await planner.pickNextSubPage(candidates);
|
|
@@ -64,6 +70,7 @@ export class ExploreCommand extends BaseCommand {
|
|
|
64
70
|
this.completedPlans.push(subPlan);
|
|
65
71
|
}
|
|
66
72
|
} catch (err) {
|
|
73
|
+
this.failedSubPages.add(normalizeUrl(pick.url));
|
|
67
74
|
tag('warning').log(`Sub-page exploration failed: ${err instanceof Error ? err.message : err}`);
|
|
68
75
|
}
|
|
69
76
|
}
|
|
@@ -52,9 +52,12 @@ export const StatusPane: React.FC<{ onComplete?: () => void }> = ({ onComplete }
|
|
|
52
52
|
<Text bold>Usage</Text>
|
|
53
53
|
</Box>
|
|
54
54
|
<Row label="Time" value={Stats.getElapsedTime()} />
|
|
55
|
-
{tokenRows.map(([model, tokens]) =>
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
{tokenRows.map(([model, tokens]) => {
|
|
56
|
+
const cached = tokens.cached ?? 0;
|
|
57
|
+
const cachePct = tokens.input > 0 ? Math.round((cached / tokens.input) * 100) : 0;
|
|
58
|
+
const suffix = cached > 0 ? ` (${Stats.humanizeTokens(cached)} cached, ${cachePct}%)` : '';
|
|
59
|
+
return <Row key={model} label={model} value={`${Stats.humanizeTokens(tokens.total)} tokens${suffix}`} />;
|
|
60
|
+
})}
|
|
58
61
|
</>
|
|
59
62
|
)}
|
|
60
63
|
</Box>
|
|
@@ -3,6 +3,7 @@ import { basename, dirname, join } from 'node:path';
|
|
|
3
3
|
import matter from 'gray-matter';
|
|
4
4
|
import { type Tokens, marked } from 'marked';
|
|
5
5
|
import type { ActionResult } from './action-result.js';
|
|
6
|
+
import { isNonReusableCode } from './ai/historian/utils.ts';
|
|
6
7
|
import { ConfigParser } from './config.js';
|
|
7
8
|
import { KnowledgeTracker } from './knowledge-tracker.js';
|
|
8
9
|
import type { WebPageState } from './state-manager.js';
|
|
@@ -166,6 +167,10 @@ export class ExperienceTracker {
|
|
|
166
167
|
writeAction(state: ActionResult, action: ActionInput): void {
|
|
167
168
|
if (this.disabled || this.isWritingDisabled(state)) return;
|
|
168
169
|
if (!action.code?.trim()) return;
|
|
170
|
+
if (isNonReusableCode(action.code)) {
|
|
171
|
+
debugLog('Skipping action with non-reusable code: %s', action.code);
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
169
174
|
|
|
170
175
|
this.ensureExperienceFile(state);
|
|
171
176
|
const stateHash = state.getStateHash();
|
|
@@ -189,6 +194,10 @@ export class ExperienceTracker {
|
|
|
189
194
|
writeFlow(state: ActionResult, body: string, relatedUrls?: string[]): void {
|
|
190
195
|
if (this.disabled || this.isWritingDisabled(state)) return;
|
|
191
196
|
if (!body?.trim()) return;
|
|
197
|
+
if (isNonReusableCode(body)) {
|
|
198
|
+
debugLog('Skipping flow body with non-reusable code');
|
|
199
|
+
return;
|
|
200
|
+
}
|
|
192
201
|
|
|
193
202
|
this.ensureExperienceFile(state);
|
|
194
203
|
const stateHash = state.getStateHash();
|
package/src/explorer.ts
CHANGED
|
@@ -549,10 +549,7 @@ class Explorer {
|
|
|
549
549
|
if (!this.stateManager.getCurrentState()) return;
|
|
550
550
|
|
|
551
551
|
const lastScreenshot = ActionResult.fromState(this.stateManager.getCurrentState()!).screenshotFile;
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
const screenshotPath = outputPath('states', lastScreenshot);
|
|
555
|
-
test.addArtifact(screenshotPath);
|
|
552
|
+
test.setActiveNoteScreenshot(lastScreenshot);
|
|
556
553
|
};
|
|
557
554
|
|
|
558
555
|
const dialogHandler = (dialog: any) => {
|
package/src/reporter.ts
CHANGED
|
@@ -110,7 +110,7 @@ export class Reporter {
|
|
|
110
110
|
const timeoutMs = Number(process.env.TESTOMATIO_TIMEOUT_MS || '15000');
|
|
111
111
|
const timeoutPromise = new Promise<'timeout'>((resolve) => setTimeout(() => resolve('timeout'), timeoutMs));
|
|
112
112
|
|
|
113
|
-
const result = await Promise.race([this.client.createRun().then(() => 'success' as const), timeoutPromise]);
|
|
113
|
+
const result = await Promise.race([this.client.createRun({ configuration: { exploratory: true } }).then(() => 'success' as const), timeoutPromise]);
|
|
114
114
|
|
|
115
115
|
if (result === 'timeout') {
|
|
116
116
|
debugLog('Reporter run creation timed out');
|
|
@@ -145,6 +145,7 @@ export class Reporter {
|
|
|
145
145
|
message: note.message,
|
|
146
146
|
status: note.status,
|
|
147
147
|
screenshot: note.screenshot,
|
|
148
|
+
log: note.log,
|
|
148
149
|
}))
|
|
149
150
|
.sort((a, b) => a.startTime - b.startTime);
|
|
150
151
|
|
|
@@ -180,9 +181,18 @@ export class Reporter {
|
|
|
180
181
|
if (noteEntry.screenshot) {
|
|
181
182
|
step.artifacts = [outputPath('states', noteEntry.screenshot)];
|
|
182
183
|
}
|
|
184
|
+
if (noteEntry.log) {
|
|
185
|
+
step.log = noteEntry.log;
|
|
186
|
+
}
|
|
183
187
|
steps.push(step);
|
|
184
188
|
}
|
|
185
189
|
|
|
190
|
+
const verificationStep = this.buildVerificationStep(test, lastScreenshotFile);
|
|
191
|
+
if (verificationStep) {
|
|
192
|
+
steps.push(verificationStep);
|
|
193
|
+
return steps;
|
|
194
|
+
}
|
|
195
|
+
|
|
186
196
|
if (lastScreenshotFile && steps.length > 0) {
|
|
187
197
|
const lastStep = steps[steps.length - 1];
|
|
188
198
|
const screenshotPath = outputPath('states', lastScreenshotFile);
|
|
@@ -196,6 +206,39 @@ export class Reporter {
|
|
|
196
206
|
return steps;
|
|
197
207
|
}
|
|
198
208
|
|
|
209
|
+
/**
 * Builds the "Verification" step for a test from `test.verification`.
 *
 * Nested framework steps are added, in order, for the verification message (if
 * set), the final URL (if set; titled with the page label when one exists, with
 * the URL as its log), and each entry of `details`. The verification's own
 * screenshot is attached as an artifact, falling back to `lastScreenshotFile`.
 *
 * @param test - Test whose verification data is read.
 * @param lastScreenshotFile - Screenshot file used when the verification has none.
 * @returns The assembled step, or undefined when the test has no verification.
 */
private buildVerificationStep(test: Test, lastScreenshotFile?: string): Step | undefined {
  const verification = test.verification;
  if (!verification) return undefined;

  const children: Step[] = [];

  if (verification.message) {
    children.push({ category: 'framework', title: verification.message, duration: 0 });
  }

  if (verification.url) {
    const title = verification.pageLabel ? `Navigated to ${verification.pageLabel}` : 'Final page';
    children.push({ category: 'framework', title, log: verification.url, duration: 0 });
  }

  for (const detail of verification.details) {
    children.push({ category: 'framework', title: detail, duration: 0 });
  }

  const shot = verification.screenshot || lastScreenshotFile;

  const verificationStep: Step = {
    category: 'user',
    title: 'Verification',
    duration: 0,
    status: verification.status || 'none',
    // An empty child list is reported as "no nested steps" rather than [].
    steps: children.length === 0 ? undefined : children,
  };

  if (shot) {
    verificationStep.artifacts = [outputPath('states', shot)];
  }

  return verificationStep;
}
|
|
241
|
+
|
|
199
242
|
async reportTest(test: Test, meta?: ReporterMeta): Promise<void> {
|
|
200
243
|
await this.startRun();
|
|
201
244
|
|