explorbot 0.1.13 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +3 -2
- package/dist/src/action.js +3 -2
- package/dist/src/ai/conversation.js +20 -4
- package/dist/src/ai/historian/utils.js +8 -1
- package/dist/src/ai/pilot.js +198 -260
- package/dist/src/ai/provider.js +25 -12
- package/dist/src/ai/quartermaster.js +2 -2
- package/dist/src/ai/researcher/focus.js +51 -10
- package/dist/src/ai/researcher/sections.js +8 -4
- package/dist/src/ai/researcher.js +9 -24
- package/dist/src/ai/rules.js +2 -0
- package/dist/src/ai/session-analyst.js +46 -41
- package/dist/src/ai/tester.js +63 -22
- package/dist/src/ai/tools.js +19 -4
- package/dist/src/commands/explore-command.js +8 -2
- package/dist/src/components/StatusPane.js +6 -1
- package/dist/src/experience-tracker.js +9 -0
- package/dist/src/explorer.js +2 -5
- package/dist/src/reporter.js +41 -1
- package/dist/src/stats.js +2 -1
- package/dist/src/test-plan.js +47 -3
- package/package.json +3 -2
- package/src/action.ts +3 -2
- package/src/ai/conversation.ts +21 -4
- package/src/ai/historian/utils.ts +8 -1
- package/src/ai/pilot.ts +199 -259
- package/src/ai/provider.ts +24 -12
- package/src/ai/quartermaster.ts +2 -2
- package/src/ai/researcher/focus.ts +57 -8
- package/src/ai/researcher/sections.ts +7 -3
- package/src/ai/researcher.ts +8 -23
- package/src/ai/rules.ts +2 -0
- package/src/ai/session-analyst.ts +47 -41
- package/src/ai/tester.ts +55 -20
- package/src/ai/tools.ts +18 -4
- package/src/commands/explore-command.ts +9 -2
- package/src/components/StatusPane.tsx +6 -3
- package/src/experience-tracker.ts +9 -0
- package/src/explorer.ts +1 -4
- package/src/reporter.ts +44 -1
- package/src/stats.ts +3 -1
- package/src/test-plan.ts +62 -3
package/dist/src/ai/provider.js
CHANGED
|
@@ -16,6 +16,16 @@ class AiError extends Error {
|
|
|
16
16
|
}
|
|
17
17
|
export class ContextLengthError extends Error {
|
|
18
18
|
}
|
|
19
|
+
/**
 * Extracts the number of cached (cache-read) input tokens from a usage record.
 *
 * The candidate fields are checked in a fixed precedence order and the first
 * one holding a finite number wins:
 *   1. usage.cachedInputTokens
 *   2. usage.inputTokenDetails.cacheReadTokens
 *   3. usage.raw.prompt_tokens_details.cached_tokens
 *   4. usage.raw.promptTokensDetails.cachedTokens
 *
 * NOTE(review): the differing field names presumably come from different
 * SDK versions / provider payloads — confirm against the provider docs.
 *
 * @param {object | null | undefined} usage - usage object attached to a model response.
 * @returns {number} cached token count, or 0 when no usable value is present.
 */
function extractCachedTokens(usage) {
    if (!usage) {
        return 0;
    }
    const raw = usage.raw;
    const candidates = [
        usage.cachedInputTokens,
        usage.inputTokenDetails?.cacheReadTokens,
        raw?.prompt_tokens_details?.cached_tokens,
        raw?.promptTokensDetails?.cachedTokens,
    ];
    // Number.isFinite does not coerce: it rejects strings, undefined, NaN and
    // Infinity, so a junk value in one field can neither mask a valid value in
    // a later field nor poison the aggregated token statistics.
    const cached = candidates.find((value) => Number.isFinite(value));
    return cached ?? 0;
}
|
|
19
29
|
function rejectAfterIdle(ms, signal) {
|
|
20
30
|
return new Promise((_, reject) => {
|
|
21
31
|
const tick = () => {
|
|
@@ -227,9 +237,10 @@ export class Provider {
|
|
|
227
237
|
responseLog(response.text);
|
|
228
238
|
if (response.usage) {
|
|
229
239
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
230
|
-
input: response.usage.promptTokens
|
|
231
|
-
output: response.usage.completionTokens
|
|
232
|
-
total: response.usage.totalTokens
|
|
240
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
241
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
242
|
+
total: response.usage.totalTokens ?? 0,
|
|
243
|
+
cached: extractCachedTokens(response.usage),
|
|
233
244
|
});
|
|
234
245
|
}
|
|
235
246
|
return response;
|
|
@@ -311,9 +322,10 @@ export class Provider {
|
|
|
311
322
|
responseLog(response.text);
|
|
312
323
|
if (response.usage) {
|
|
313
324
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
314
|
-
input: response.usage.promptTokens
|
|
315
|
-
output: response.usage.completionTokens
|
|
316
|
-
total: response.usage.totalTokens
|
|
325
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
326
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
327
|
+
total: response.usage.totalTokens ?? 0,
|
|
328
|
+
cached: extractCachedTokens(response.usage),
|
|
317
329
|
});
|
|
318
330
|
}
|
|
319
331
|
return response;
|
|
@@ -379,9 +391,10 @@ export class Provider {
|
|
|
379
391
|
responseLog(response.object);
|
|
380
392
|
if (response.usage) {
|
|
381
393
|
Stats.recordTokens(options.agentName || 'unknown', modelName, {
|
|
382
|
-
input: response.usage.promptTokens
|
|
383
|
-
output: response.usage.completionTokens
|
|
384
|
-
total: response.usage.totalTokens
|
|
394
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
395
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
396
|
+
total: response.usage.totalTokens ?? 0,
|
|
397
|
+
cached: extractCachedTokens(response.usage),
|
|
385
398
|
});
|
|
386
399
|
}
|
|
387
400
|
return response;
|
|
@@ -555,9 +568,9 @@ export class Provider {
|
|
|
555
568
|
responseLog(response.text);
|
|
556
569
|
if (response.usage) {
|
|
557
570
|
Stats.recordTokens('vision', this.getModelName(this.config.visionModel), {
|
|
558
|
-
input: response.usage.promptTokens
|
|
559
|
-
output: response.usage.completionTokens
|
|
560
|
-
total: response.usage.totalTokens
|
|
571
|
+
input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
|
|
572
|
+
output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
|
|
573
|
+
total: response.usage.totalTokens ?? 0,
|
|
561
574
|
});
|
|
562
575
|
}
|
|
563
576
|
return response;
|
|
@@ -169,10 +169,10 @@ Focus on what would confuse a real user or caused the agent to make mistakes.`;
|
|
|
169
169
|
const criticalViolations = report.axeViolations.filter((v) => v.impact === 'critical' || v.impact === 'serious');
|
|
170
170
|
for (const v of criticalViolations.slice(0, 3)) {
|
|
171
171
|
const nodeHtml = v.nodes[0]?.html.slice(0, 100) || '';
|
|
172
|
-
task.
|
|
172
|
+
task.addVerificationDetail(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
|
|
173
173
|
}
|
|
174
174
|
for (const issue of report.semanticIssues.slice(0, 3)) {
|
|
175
|
-
task.
|
|
175
|
+
task.addVerificationDetail(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
|
|
176
176
|
}
|
|
177
177
|
}
|
|
178
178
|
saveReport(stateHash, report) {
|
|
@@ -1,20 +1,61 @@
|
|
|
1
|
-
import { detectFocusArea } from "../../utils/aria.js";
|
|
2
1
|
import { mdq } from "../../utils/markdown-query.js";
|
|
3
2
|
export const FOCUSED_MARKER = '> **Focused**';
|
|
4
3
|
const FOCUS_SKIP_SECTIONS = new Set(['navigation', 'menu']);
|
|
5
4
|
export function hasFocusedSection(text) {
|
|
6
5
|
return text.includes(FOCUSED_MARKER);
|
|
7
6
|
}
|
|
8
|
-
export function
|
|
9
|
-
const
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
const
|
|
14
|
-
if (
|
|
15
|
-
|
|
7
|
+
/**
 * Chooses which research section currently looks like the page's focus area.
 *
 * Every section that has a `containerCss` and is not listed in
 * FOCUS_SKIP_SECTIONS is probed through `page.locator(...)`: the first
 * matching element must be visible, and is then inspected for dialog
 * semantics, the highest z-index on its ancestor chain and a box shadow.
 *
 * Ranking: if any probed section is a dialog, only dialogs compete. Within the
 * pool the higher z-index wins, a box shadow breaks z-index ties, and on a
 * full tie the earlier section is kept.
 *
 * @param {object} page - page object exposing `locator()` (a Playwright page at the visible call site).
 * @param {Array<{name: string, containerCss?: string}>} sections - parsed research sections.
 * @returns {Promise<string | null>} name of the focused section, or null when
 *   nothing could be probed or the winner shows no elevation signal at all.
 */
export async function detectFocusedSection(page, sections) {
    // Handed to locator.evaluate; it only touches `el`, `document` and `window`
    // and deliberately closes over nothing from this module.
    const probeContainer = (el) => {
        const dialogSelector = '[role="dialog"], [role="alertdialog"], [aria-modal="true"]';
        const isDialog = el.matches(dialogSelector) || !!el.querySelector(dialogSelector);
        let maxZ = 0;
        // Walk from the container up to (but excluding) <body>, tracking the
        // largest numeric z-index among positioned elements.
        for (let node = el; node && node !== document.body; node = node.parentElement) {
            const style = window.getComputedStyle(node);
            if (style.position === 'static') {
                continue;
            }
            const z = Number.parseInt(style.zIndex, 10);
            if (!Number.isNaN(z) && z > maxZ) {
                maxZ = z;
            }
        }
        const shadow = window.getComputedStyle(el).boxShadow;
        const hasShadow = !!shadow && shadow !== 'none';
        return { isDialog, zIndex: maxZ, hasShadow };
    };

    const candidates = [];
    for (const section of sections) {
        if (!section.containerCss) {
            continue;
        }
        const key = section.name.toLowerCase().replace(/^section:\s*/, '');
        if (FOCUS_SKIP_SECTIONS.has(key)) {
            continue;
        }
        try {
            const container = page.locator(section.containerCss).first();
            const visible = await container.isVisible();
            if (visible) {
                const probe = await container.evaluate(probeContainer);
                candidates.push({ name: section.name, ...probe });
            }
        }
        catch {
            // Best effort: a section whose probe throws is simply not a candidate.
        }
    }

    if (candidates.length === 0) {
        return null;
    }

    const dialogs = candidates.filter((candidate) => candidate.isDialog);
    const hasDialogs = dialogs.length > 0;
    const pool = hasDialogs ? dialogs : candidates;

    let winner = null;
    for (const contender of pool) {
        if (!winner) {
            winner = contender;
        }
        else if (contender.zIndex !== winner.zIndex) {
            if (contender.zIndex > winner.zIndex) {
                winner = contender;
            }
        }
        else if (contender.hasShadow !== winner.hasShadow && contender.hasShadow) {
            winner = contender;
        }
    }

    if (!winner) {
        return null;
    }
    // Without a dialog, a flat, shadow-less winner carries no focus signal.
    if (!hasDialogs && winner.zIndex === 0 && !winner.hasShadow) {
        return null;
    }
    return winner.name;
}
|
|
19
60
|
export function markSectionAsFocused(result, sectionName) {
|
|
20
61
|
if (hasFocusedSection(result.text))
|
|
@@ -3,6 +3,8 @@ import { executionController } from "../../execution-controller.js";
|
|
|
3
3
|
import { tag } from '../../utils/logger.js';
|
|
4
4
|
import { RulesLoader } from "../../utils/rules-loader.js";
|
|
5
5
|
import { locatorRule as generalLocatorRuleText } from '../rules.js';
|
|
6
|
+
import { markSectionAsFocused } from "./focus.js";
|
|
7
|
+
import { ResearchResult } from "./research-result.js";
|
|
6
8
|
export function WithSections(Base) {
|
|
7
9
|
return class extends Base {
|
|
8
10
|
async researchBySections() {
|
|
@@ -40,10 +42,12 @@ export function WithSections(Base) {
|
|
|
40
42
|
if (parts.length === 0) {
|
|
41
43
|
throw new Error('Per-section research produced no sections — AI responses all empty or NOT_PRESENT');
|
|
42
44
|
}
|
|
43
|
-
|
|
44
|
-
if (focusCss)
|
|
45
|
-
merged
|
|
46
|
-
|
|
45
|
+
const merged = parts.join('\n\n');
|
|
46
|
+
if (!focusCss)
|
|
47
|
+
return merged;
|
|
48
|
+
const focused = new ResearchResult(merged, this.actionResult?.url || '');
|
|
49
|
+
markSectionAsFocused(focused, 'Focus');
|
|
50
|
+
return focused.text;
|
|
47
51
|
}
|
|
48
52
|
async _detectFocusCss() {
|
|
49
53
|
const focusSections = this.explorer.getConfig().ai?.agents?.researcher?.focusSections;
|
|
@@ -16,7 +16,7 @@ import { ContextLengthError } from './provider.js';
|
|
|
16
16
|
import { findSimilarResearch, getCachedResearch, saveResearch } from "./researcher/cache.js";
|
|
17
17
|
import { WithCoordinates } from "./researcher/coordinates.js";
|
|
18
18
|
import { WithDeepAnalysis } from "./researcher/deep-analysis.js";
|
|
19
|
-
import {
|
|
19
|
+
import { detectFocusedSection, hasFocusedSection, markSectionAsFocused, pickDefaultFocusedSection } from "./researcher/focus.js";
|
|
20
20
|
import { WithLocators } from "./researcher/locators.js";
|
|
21
21
|
import { extractValidContainers, formatResearchSummary, parseResearchSections } from "./researcher/parser.js";
|
|
22
22
|
import { ResearchResult } from "./researcher/research-result.js";
|
|
@@ -186,18 +186,13 @@ export class Researcher extends ResearcherBase {
|
|
|
186
186
|
if (!interrupted() && fix && result.locators.some((l) => l.valid === false)) {
|
|
187
187
|
await this.fixBrokenSections(result, activeConversation);
|
|
188
188
|
}
|
|
189
|
-
// Focused section:
|
|
190
|
-
|
|
191
|
-
if (
|
|
192
|
-
result.text = result.text.replace(focusMatch[0], '');
|
|
193
|
-
markSectionAsFocused(result, focusMatch[1].trim());
|
|
194
|
-
}
|
|
195
|
-
if (!hasFocusedSection(result.text)) {
|
|
189
|
+
// Focused section: unified Playwright probe (HTML+CSS+visibility).
|
|
190
|
+
// Must run BEFORE visuallyAnnotateContainers — annotation overlays inject z-index 99998+ which would pollute the scoring.
|
|
191
|
+
if (!interrupted() && this.hasScreenshotToAnalyze) {
|
|
196
192
|
const sections = parseResearchSections(result.text);
|
|
197
|
-
const
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
markSectionAsFocused(result, focusedName);
|
|
193
|
+
const focused = await detectFocusedSection(this.explorer.playwrightHelper.page, sections);
|
|
194
|
+
if (focused)
|
|
195
|
+
markSectionAsFocused(result, focused);
|
|
201
196
|
}
|
|
202
197
|
// Stage 4: Visual analysis
|
|
203
198
|
if (!interrupted() && this.hasScreenshotToAnalyze) {
|
|
@@ -232,8 +227,8 @@ export class Researcher extends ResearcherBase {
|
|
|
232
227
|
await this.backfillCoordinates(result);
|
|
233
228
|
await this.backfillBrokenLocators(result);
|
|
234
229
|
}
|
|
235
|
-
// Focused section: final fallback
|
|
236
|
-
if (!hasFocusedSection(result.text)) {
|
|
230
|
+
// Focused section: final fallback (vision-only — without a screenshot we don't infer focus)
|
|
231
|
+
if (this.hasScreenshotToAnalyze && !hasFocusedSection(result.text)) {
|
|
237
232
|
const sections = parseResearchSections(result.text);
|
|
238
233
|
const fallback = pickDefaultFocusedSection(sections);
|
|
239
234
|
if (fallback)
|
|
@@ -388,16 +383,6 @@ export class Researcher extends ResearcherBase {
|
|
|
388
383
|
|
|
389
384
|
| Element | ARIA | CSS | eidx |
|
|
390
385
|
</section_format>
|
|
391
|
-
|
|
392
|
-
<focused_section>
|
|
393
|
-
At the end of your output, declare the primary focus area on a single line:
|
|
394
|
-
|
|
395
|
-
> Focused: <exact section name>
|
|
396
|
-
|
|
397
|
-
- If a dialog/modal/drawer/overlay exists, it is focused.
|
|
398
|
-
- Otherwise pick the section where the main business action happens (list for catalog, detail for item page, content for article).
|
|
399
|
-
- Navigation and menu/toolbar are never focused.
|
|
400
|
-
</focused_section>
|
|
401
386
|
`;
|
|
402
387
|
}
|
|
403
388
|
async buildResearchPrompt() {
|
package/dist/src/ai/rules.js
CHANGED
|
@@ -231,6 +231,8 @@ export function multipleTabsRule(tabs) {
|
|
|
231
231
|
}
|
|
232
232
|
export const actionRule = dedent `
|
|
233
233
|
<actions>
|
|
234
|
+
\`faker\` (from @faker-js/faker) is available inside I.* calls for generating data, e.g. I.fillField('Bio', faker.lorem.paragraphs(5)).
|
|
235
|
+
|
|
234
236
|
### I.click
|
|
235
237
|
|
|
236
238
|
clicks on the element by its locator
|
|
@@ -13,68 +13,70 @@ export class SessionAnalyst {
|
|
|
13
13
|
const eligible = tests.filter((t) => t.startTime != null);
|
|
14
14
|
if (eligible.length === 0)
|
|
15
15
|
return '';
|
|
16
|
-
const model = this.provider.
|
|
16
|
+
const model = this.provider.getAgenticModel('analyst');
|
|
17
17
|
const customPrompt = this.provider.getSystemPromptForAgent('analyst', undefined);
|
|
18
18
|
const systemPrompt = dedent `
|
|
19
|
-
You write a
|
|
19
|
+
You write a TERSE end-of-session report. Reader is a developer who wants to UNDERSTAND THE FEATURE — what works, what is broken, what is unclear. Every word must earn its place.
|
|
20
20
|
|
|
21
|
-
Output MARKDOWN. No JSON, no preamble, no closing
|
|
21
|
+
Output MARKDOWN. No JSON, no preamble, no closing summary.
|
|
22
22
|
|
|
23
|
-
|
|
24
|
-
Group by ROOT CAUSE, not by scenario. If three tests fail for the same dropdown, that is ONE defect listing all three test refs (#3, #5, #7). Do not produce one cluster per test.
|
|
23
|
+
NO EMOJI. No 🔴 🟡 🟢 ✅, no escape sequences like \\u2705. Use plain text severity tags: [High], [Medium], [Low] for defects.
|
|
25
24
|
|
|
26
|
-
##
|
|
27
|
-
Use the FINAL verdict (the test's \`result\` field) as the starting point. Mid-test errors that the automation recovered from do NOT make a passed test unreliable.
|
|
25
|
+
## Reporting unit
|
|
28
26
|
|
|
29
|
-
|
|
30
|
-
- **UX issue** — app works but the UI is ambiguous, controls are hidden, or labels are unclear. Worth flagging to design.
|
|
31
|
-
- **Execution issue** — the FINAL verdict is unreliable. Only two cases:
|
|
32
|
-
1. \`result: failed\` AND the failure was automation, environment, or UI/UX (locator missing, timeout, AI loop, navigation stuck, modal trapped focus, no accessible label) — i.e. the test could not conclude whether the app works.
|
|
33
|
-
2. \`result: passed\` AND clear evidence in the log shows the user-visible goal was NOT achieved (no confirmation visible, no state change verified, the assertion was vacuous).
|
|
27
|
+
Report at the level of FEATURES / FLOWS / PAGES. Tests are evidence, not the unit. Several tests covering the same flow → ONE entry citing all of them.
|
|
34
28
|
|
|
35
|
-
|
|
29
|
+
## Walk every test
|
|
36
30
|
|
|
37
|
-
|
|
38
|
-
- 🔴 critical or high — core flow blocked, data loss, security
|
|
39
|
-
- 🟡 medium — partial breakage with workaround
|
|
40
|
-
- 🟢 low — cosmetic
|
|
31
|
+
PASSED test: did all steps run, was the goal actually verified, did the user-visible goal happen? All yes → contributes to What works. Any no → Execution issue (false positive).
|
|
41
32
|
|
|
42
|
-
|
|
33
|
+
FAILED test, first match wins: (1) goal achieved but mis-verified → Execution. (2) automation failure (locator/timeout/loop/modal/a11y) → Execution. (3) bad preconditions or data → Execution. (4) wrong URL/environment → Execution. (5) app contradicted expected outcome → Defect.
|
|
34
|
+
|
|
35
|
+
Crucial distinction: "the app misbehaved" vs "the automation could not interact with the app". ONLY the first is a Defect. If the automation gives up before the app responds — timeout, retries exhausted, dead loop / loop detected, could not click or find an element — that is an Execution issue regardless of what the log calls it. Failure inside the automation ≠ failure inside the product.
|
|
36
|
+
|
|
37
|
+
A solitary failure where adjacent tests on the same feature passed → Execution, not Defect.
|
|
38
|
+
|
|
39
|
+
## Severity (defects only)
|
|
40
|
+
[High] blocks a core flow · [Medium] degrades a flow but workaround exists · [Low] cosmetic / edge case
|
|
41
|
+
|
|
42
|
+
## Format
|
|
43
43
|
|
|
44
44
|
# Session Analysis
|
|
45
45
|
|
|
46
|
-
<
|
|
46
|
+
<ONE or TWO sentences describing the FEATURE STATE — what was explored, whether the core flow holds, what the standout problem is. NO test counts, NO "N tests run". Talk about the product, not the run.>
|
|
47
|
+
|
|
48
|
+
## Coverage
|
|
49
|
+
- Pages: <paths>
|
|
50
|
+
- Features: <capabilities>
|
|
51
|
+
|
|
52
|
+
## What works
|
|
53
|
+
- **<feature>** — #2, #7, #8
|
|
47
54
|
|
|
48
55
|
## Defects
|
|
49
56
|
|
|
50
|
-
###
|
|
51
|
-
Affects: #3, #5
|
|
57
|
+
### [Medium] <plain-English bug title>
|
|
58
|
+
Affects: #3, #5
|
|
52
59
|
Reproduce:
|
|
53
|
-
1. <concrete UI step
|
|
54
|
-
2. <next
|
|
55
|
-
Evidence: <one short observation
|
|
56
|
-
|
|
57
|
-
### 🟡 <next defect>
|
|
58
|
-
...
|
|
60
|
+
1. <concrete UI step>
|
|
61
|
+
2. <next>
|
|
62
|
+
Evidence: <one short observation>
|
|
59
63
|
|
|
60
64
|
## UX issues
|
|
61
|
-
|
|
62
|
-
- **<title>** — #4
|
|
63
|
-
<one short evidence line>
|
|
65
|
+
- **<feature>** — <what's confusing> (#7)
|
|
64
66
|
|
|
65
67
|
## Execution Issues
|
|
68
|
+
- **#2 <scenario>** — <≤10 words, what was unreliable>
|
|
66
69
|
|
|
67
|
-
|
|
68
|
-
- **<…>** — <…>
|
|
70
|
+
## Brevity rules
|
|
69
71
|
|
|
70
|
-
|
|
71
|
-
-
|
|
72
|
-
- Defect title
|
|
73
|
-
- Reproduce steps are
|
|
74
|
-
- Evidence is
|
|
75
|
-
-
|
|
76
|
-
-
|
|
77
|
-
-
|
|
72
|
+
- Headline: 2 sentences MAX. About the FEATURE, not the run. No counts, no "N tests", no "this session". Banned words: "exercised", "comprehensive", "notably", "this session", "module", "targeted", "covered creation".
|
|
73
|
+
- What works: feature name + test refs. NO parentheticals, NO caveats. If there's a caveat, the entry doesn't belong here.
|
|
74
|
+
- Defect title is the BUG ("Search returns non-matching results"), never the scenario name.
|
|
75
|
+
- Reproduce steps are imperative one-liners drawn from the log.
|
|
76
|
+
- Evidence is one short factual observation. Never quote the \`result\` field.
|
|
77
|
+
- Execution Issues: ONE line per test, ≤10 words, plain. Examples: "passed vacuously, no list assertion", "no file upload step in log", "dead loop on Save click". No prefixes, no nested explanation.
|
|
78
|
+
- Omit any empty section.
|
|
79
|
+
- Section order: Coverage → What works → Defects (severity desc) → UX issues → Execution Issues.
|
|
78
80
|
|
|
79
81
|
${customPrompt || ''}
|
|
80
82
|
`;
|
|
@@ -87,7 +89,7 @@ export class SessionAnalyst {
|
|
|
87
89
|
{ role: 'system', content: systemPrompt },
|
|
88
90
|
{ role: 'user', content: userPayload },
|
|
89
91
|
], model, { agentName: 'analyst' });
|
|
90
|
-
return (response?.text || '').trim();
|
|
92
|
+
return decodeEscapes((response?.text || '').trim());
|
|
91
93
|
}
|
|
92
94
|
writeReport(markdown) {
|
|
93
95
|
const filePath = outputPath('reports', `${Stats.sessionLabel()}.md`);
|
|
@@ -115,3 +117,6 @@ export class SessionAnalyst {
|
|
|
115
117
|
`;
|
|
116
118
|
}
|
|
117
119
|
}
|
|
120
|
+
/**
 * Replaces literal JavaScript-style unicode escape sequences in `text`
 * (`\u{1F534}` and `\u2705` forms, written out as backslash + "u" + hex)
 * with the characters they denote.
 *
 * Both forms are handled in a single pass, so text produced by decoding one
 * escape is never re-scanned and decoded a second time. An escape whose value
 * lies above U+10FFFF is left untouched rather than handed to
 * String.fromCodePoint, which would throw a RangeError.
 *
 * @param {string} text - text that may contain literal unicode escapes.
 * @returns {string} text with every valid escape decoded.
 */
function decodeEscapes(text) {
    return text.replace(/\\u\{([0-9a-fA-F]+)\}|\\u([0-9a-fA-F]{4})/g, (match, braced, fixed) => {
        const codePoint = Number.parseInt(braced ?? fixed, 16);
        if (codePoint > 0x10ffff) {
            // Not a valid Unicode code point — keep the original sequence.
            return match;
        }
        return String.fromCodePoint(codePoint);
    });
}
|
package/dist/src/ai/tester.js
CHANGED
|
@@ -49,6 +49,8 @@ export class Tester extends TaskAgent {
|
|
|
49
49
|
pageStateHash = null;
|
|
50
50
|
pageActionResult = null;
|
|
51
51
|
hooksRunner;
|
|
52
|
+
seenUiMapUrls = new Set();
|
|
53
|
+
lastAnalyzedStateHash = null;
|
|
52
54
|
constructor(explorer, provider, researcher, navigator, agentTools) {
|
|
53
55
|
super();
|
|
54
56
|
this.explorer = explorer;
|
|
@@ -80,7 +82,7 @@ export class Tester extends TaskAgent {
|
|
|
80
82
|
return ActionResult.fromState(this.explorer.getStateManager().getCurrentState());
|
|
81
83
|
}
|
|
82
84
|
get progressCheckInterval() {
|
|
83
|
-
return this.explorer.getConfig().ai?.agents?.tester?.progressCheckInterval ??
|
|
85
|
+
return this.explorer.getConfig().ai?.agents?.tester?.progressCheckInterval ?? 3;
|
|
84
86
|
}
|
|
85
87
|
getConversation() {
|
|
86
88
|
return this.currentConversation;
|
|
@@ -96,6 +98,8 @@ export class Tester extends TaskAgent {
|
|
|
96
98
|
this.previousStateHash = null;
|
|
97
99
|
this.pageStateHash = null;
|
|
98
100
|
this.pageActionResult = null;
|
|
101
|
+
this.seenUiMapUrls.clear();
|
|
102
|
+
this.lastAnalyzedStateHash = null;
|
|
99
103
|
this.explorer.getStateManager().clearHistory();
|
|
100
104
|
this.resetFailureCount();
|
|
101
105
|
this.pilot?.reset();
|
|
@@ -117,12 +121,18 @@ export class Tester extends TaskAgent {
|
|
|
117
121
|
page?.on('console', onConsoleMessage);
|
|
118
122
|
const initialState = ActionResult.fromState(state);
|
|
119
123
|
const conversation = this.provider.startConversation(this.getSystemMessage(), 'tester');
|
|
124
|
+
conversation.markLastMessageCacheable();
|
|
120
125
|
this.currentConversation = conversation;
|
|
121
126
|
const outputDir = ConfigParser.getInstance().getOutputDir();
|
|
122
127
|
this.executionLogFile = join(outputDir, `tester_${task.sessionName}.md`);
|
|
123
128
|
// Note: Markdown saving functionality removed from Conversation class
|
|
124
|
-
const
|
|
125
|
-
conversation.addUserText(
|
|
129
|
+
const scenarioBlock = this.buildScenarioBlock(task, initialState);
|
|
130
|
+
conversation.addUserText(scenarioBlock);
|
|
131
|
+
conversation.markLastMessageCacheable();
|
|
132
|
+
conversation.protectPrefix(conversation.messages.length);
|
|
133
|
+
const pageContext = await this.reinjectContextIfNeeded(1, initialState);
|
|
134
|
+
if (pageContext)
|
|
135
|
+
conversation.addUserText(pageContext);
|
|
126
136
|
return await Observability.run(`test: ${task.scenario}`, {
|
|
127
137
|
sessionId: task.sessionName,
|
|
128
138
|
tags: ['tester'],
|
|
@@ -138,6 +148,12 @@ export class Tester extends TaskAgent {
|
|
|
138
148
|
if (this.pilot) {
|
|
139
149
|
try {
|
|
140
150
|
const plan = await this.pilot.planTest(task, initialState);
|
|
151
|
+
if (task.hasFinished) {
|
|
152
|
+
offFailedRequest?.();
|
|
153
|
+
page?.off('pageerror', onPageError);
|
|
154
|
+
page?.off('console', onConsoleMessage);
|
|
155
|
+
return { success: task.isSuccessful };
|
|
156
|
+
}
|
|
141
157
|
if (plan) {
|
|
142
158
|
conversation.addUserText(`Pilot's test plan:\n${plan}\n\nFollow this plan while executing the test.`);
|
|
143
159
|
}
|
|
@@ -158,14 +174,18 @@ export class Tester extends TaskAgent {
|
|
|
158
174
|
await this.explorer.startTest(task);
|
|
159
175
|
debugLog(`Navigating to ${task.startUrl}`);
|
|
160
176
|
await this.explorer.visit(task.startUrl);
|
|
161
|
-
const
|
|
177
|
+
const startState = this.explorer.getStateManager().getCurrentState();
|
|
178
|
+
if (startState)
|
|
179
|
+
task.addUrlNote(startState);
|
|
180
|
+
const currentUrl = startState?.url || task.startUrl || '';
|
|
162
181
|
await this.hooksRunner.runBeforeHook('tester', currentUrl);
|
|
163
182
|
const offStateChange = this.explorer.getStateManager().onStateChange((event) => {
|
|
164
183
|
if (task.hasFinished)
|
|
165
184
|
return;
|
|
166
185
|
if (event.toState?.url === event.fromState?.url)
|
|
167
186
|
return;
|
|
168
|
-
|
|
187
|
+
if (event.toState)
|
|
188
|
+
task.addUrlNote(event.toState, event.fromState || undefined);
|
|
169
189
|
task.states.push(event.toState);
|
|
170
190
|
});
|
|
171
191
|
const codeceptjsTools = createCodeceptJSTools(this.explorer, task);
|
|
@@ -203,13 +223,13 @@ export class Tester extends TaskAgent {
|
|
|
203
223
|
The user has interrupted and wants to change direction. Follow the new instruction.
|
|
204
224
|
`);
|
|
205
225
|
}
|
|
206
|
-
conversation.cleanupTag('page_aria', '...cleaned aria snapshot...',
|
|
226
|
+
conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 1);
|
|
207
227
|
conversation.cleanupTag('page_html', '...cleaned HTML snapshot...', 1);
|
|
208
228
|
conversation.cleanupTag('experience', '...cleaned experience...', 1);
|
|
209
229
|
conversation.cleanupTag('applied_experience', '...cleaned past experience...', 1);
|
|
210
230
|
conversation.cleanupTag('page_ui_map', '...cleaned UI map...', 1);
|
|
211
231
|
conversation.cleanupTag('page_ui_map_overlay', '...cleaned UI overlay...', 1);
|
|
212
|
-
conversation.compactToolResults(
|
|
232
|
+
conversation.compactToolResults(2);
|
|
213
233
|
if (iteration > 1) {
|
|
214
234
|
const isNewPage = this.previousUrl !== null && this.previousUrl !== currentState.url;
|
|
215
235
|
let nextStep = '';
|
|
@@ -220,16 +240,17 @@ export class Tester extends TaskAgent {
|
|
|
220
240
|
if (guidance)
|
|
221
241
|
nextStep += `\n\n${guidance}`;
|
|
222
242
|
}
|
|
223
|
-
else if ((iteration
|
|
243
|
+
else if (this.shouldAnalyzeProgress(iteration, currentState) && this.pilot) {
|
|
224
244
|
const guidance = await this.pilot.analyzeProgress(task, currentState, conversation);
|
|
225
245
|
if (guidance)
|
|
226
246
|
nextStep += `\n\n${guidance}`;
|
|
227
247
|
this.consecutiveFailures = 0;
|
|
248
|
+
this.lastAnalyzedStateHash = currentState.hash;
|
|
228
249
|
}
|
|
229
250
|
conversation.addUserText(nextStep);
|
|
230
251
|
}
|
|
231
252
|
const result = await this.provider.invokeConversation(conversation, tools, {
|
|
232
|
-
maxToolRoundtrips:
|
|
253
|
+
maxToolRoundtrips: 3,
|
|
233
254
|
toolChoice: 'required',
|
|
234
255
|
stopWhen: () => task.hasFinished,
|
|
235
256
|
});
|
|
@@ -308,10 +329,15 @@ export class Tester extends TaskAgent {
|
|
|
308
329
|
: undefined,
|
|
309
330
|
catch: async ({ error, stop }) => {
|
|
310
331
|
tag('error').log(`Test execution error: ${error}`);
|
|
332
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
311
333
|
if (!task.hasFinished) {
|
|
312
|
-
task.addNote(`Execution error: ${
|
|
334
|
+
task.addNote(`Execution error: ${message}`);
|
|
313
335
|
}
|
|
314
|
-
|
|
336
|
+
if (error instanceof Error && error.name === 'AbortError') {
|
|
337
|
+
stop();
|
|
338
|
+
return;
|
|
339
|
+
}
|
|
340
|
+
conversation.addUserText(`Previous AI call failed: ${message}. Take a different approach on the next step.`);
|
|
315
341
|
},
|
|
316
342
|
});
|
|
317
343
|
if (task.hasFinished)
|
|
@@ -354,6 +380,17 @@ export class Tester extends TaskAgent {
|
|
|
354
380
|
...task,
|
|
355
381
|
};
|
|
356
382
|
}
|
|
383
|
+
shouldAnalyzeProgress(iteration, currentState) {
|
|
384
|
+
if (this.consecutiveFailures >= 3)
|
|
385
|
+
return true;
|
|
386
|
+
if (this.consecutiveEmptyResults >= 2)
|
|
387
|
+
return true;
|
|
388
|
+
if (iteration % this.progressCheckInterval !== 0)
|
|
389
|
+
return false;
|
|
390
|
+
if (this.lastAnalyzedStateHash === currentState.hash)
|
|
391
|
+
return false;
|
|
392
|
+
return true;
|
|
393
|
+
}
|
|
357
394
|
async prepareInstructionsForNextStep(task) {
|
|
358
395
|
let outcomeStatus = dedent `
|
|
359
396
|
<task>
|
|
@@ -432,19 +469,23 @@ export class Tester extends TaskAgent {
|
|
|
432
469
|
this.explorer.clearOtherTabsInfo();
|
|
433
470
|
}
|
|
434
471
|
if (isNewUrl) {
|
|
472
|
+
const alreadySeenUiMap = this.seenUiMapUrls.has(currentUrl);
|
|
435
473
|
let research = '';
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
474
|
+
if (!alreadySeenUiMap) {
|
|
475
|
+
try {
|
|
476
|
+
research = await this.researcher.research(currentState);
|
|
477
|
+
}
|
|
478
|
+
catch (err) {
|
|
479
|
+
if (!(err instanceof ErrorPageError))
|
|
480
|
+
throw err;
|
|
481
|
+
tag('warning').log(`Research skipped: ${err.message}`);
|
|
482
|
+
}
|
|
443
483
|
}
|
|
444
484
|
this.pageStateHash = currentStateHash;
|
|
445
485
|
this.pageActionResult = currentState;
|
|
446
486
|
let uiMapSection = '';
|
|
447
487
|
if (research) {
|
|
488
|
+
this.seenUiMapUrls.add(currentUrl);
|
|
448
489
|
uiMapSection = dedent `
|
|
449
490
|
|
|
450
491
|
Page UI Map
|
|
@@ -454,6 +495,9 @@ export class Tester extends TaskAgent {
|
|
|
454
495
|
</page_ui_map>
|
|
455
496
|
`;
|
|
456
497
|
}
|
|
498
|
+
else if (alreadySeenUiMap) {
|
|
499
|
+
uiMapSection = `\n\n<page_ui_map>UI map for ${currentUrl} was shown earlier in this session — refer to it above.</page_ui_map>`;
|
|
500
|
+
}
|
|
457
501
|
context += dedent `
|
|
458
502
|
Context:
|
|
459
503
|
|
|
@@ -651,9 +695,8 @@ export class Tester extends TaskAgent {
|
|
|
651
695
|
${this.provider.getSystemPromptForAgent('tester', this.explorer.getStateManager().getCurrentState()?.url) || ''}
|
|
652
696
|
`;
|
|
653
697
|
}
|
|
654
|
-
|
|
698
|
+
buildScenarioBlock(task, actionResult) {
|
|
655
699
|
const knowledge = this.getKnowledge(actionResult);
|
|
656
|
-
const pageContext = await this.reinjectContextIfNeeded(1, actionResult);
|
|
657
700
|
return dedent `
|
|
658
701
|
<task>
|
|
659
702
|
SCENARIO GOAL: ${task.scenario}
|
|
@@ -680,8 +723,6 @@ export class Tester extends TaskAgent {
|
|
|
680
723
|
${this.buildAvailableFiles()}
|
|
681
724
|
|
|
682
725
|
${knowledge}
|
|
683
|
-
|
|
684
|
-
${pageContext}
|
|
685
726
|
`;
|
|
686
727
|
}
|
|
687
728
|
getDeletableSessionNames(task) {
|