explorbot 0.1.13 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/package.json +3 -2
  2. package/dist/src/action.js +3 -2
  3. package/dist/src/ai/conversation.js +20 -4
  4. package/dist/src/ai/historian/utils.js +8 -1
  5. package/dist/src/ai/pilot.js +198 -260
  6. package/dist/src/ai/provider.js +25 -12
  7. package/dist/src/ai/quartermaster.js +2 -2
  8. package/dist/src/ai/researcher/focus.js +51 -10
  9. package/dist/src/ai/researcher/sections.js +8 -4
  10. package/dist/src/ai/researcher.js +9 -24
  11. package/dist/src/ai/rules.js +2 -0
  12. package/dist/src/ai/session-analyst.js +46 -41
  13. package/dist/src/ai/tester.js +63 -22
  14. package/dist/src/ai/tools.js +19 -4
  15. package/dist/src/commands/explore-command.js +8 -2
  16. package/dist/src/components/StatusPane.js +6 -1
  17. package/dist/src/experience-tracker.js +9 -0
  18. package/dist/src/explorer.js +2 -5
  19. package/dist/src/reporter.js +41 -1
  20. package/dist/src/stats.js +2 -1
  21. package/dist/src/test-plan.js +47 -3
  22. package/package.json +3 -2
  23. package/src/action.ts +3 -2
  24. package/src/ai/conversation.ts +21 -4
  25. package/src/ai/historian/utils.ts +8 -1
  26. package/src/ai/pilot.ts +199 -259
  27. package/src/ai/provider.ts +24 -12
  28. package/src/ai/quartermaster.ts +2 -2
  29. package/src/ai/researcher/focus.ts +57 -8
  30. package/src/ai/researcher/sections.ts +7 -3
  31. package/src/ai/researcher.ts +8 -23
  32. package/src/ai/rules.ts +2 -0
  33. package/src/ai/session-analyst.ts +47 -41
  34. package/src/ai/tester.ts +55 -20
  35. package/src/ai/tools.ts +18 -4
  36. package/src/commands/explore-command.ts +9 -2
  37. package/src/components/StatusPane.tsx +6 -3
  38. package/src/experience-tracker.ts +9 -0
  39. package/src/explorer.ts +1 -4
  40. package/src/reporter.ts +44 -1
  41. package/src/stats.ts +3 -1
  42. package/src/test-plan.ts +62 -3
@@ -16,6 +16,16 @@ class AiError extends Error {
16
16
  }
17
17
  export class ContextLengthError extends Error {
18
18
  }
19
/**
 * Extracts the number of prompt tokens that were served from the provider's cache.
 *
 * Looks at the known locations in precedence order and returns the first usable value:
 *   1. usage.cachedInputTokens
 *   2. usage.inputTokenDetails.cacheReadTokens
 *   3. usage.raw.prompt_tokens_details.cached_tokens
 *   4. usage.raw.promptTokensDetails.cachedTokens
 *
 * @param {object | null | undefined} usage - usage object attached to an AI response.
 * @returns {number} cached token count, or 0 when none is reported.
 */
function extractCachedTokens(usage) {
    if (!usage) {
        return 0;
    }
    const raw = usage.raw;
    const candidates = [
        usage.cachedInputTokens,
        usage.inputTokenDetails?.cacheReadTokens,
        raw?.prompt_tokens_details?.cached_tokens,
        raw?.promptTokensDetails?.cachedTokens,
    ];
    // `typeof x === 'number'` also accepts NaN/Infinity, which would corrupt the
    // accumulated token statistics; only finite, non-negative counts are meaningful.
    const cached = candidates.find((value) => Number.isFinite(value) && value >= 0);
    return cached ?? 0;
}
19
29
  function rejectAfterIdle(ms, signal) {
20
30
  return new Promise((_, reject) => {
21
31
  const tick = () => {
@@ -227,9 +237,10 @@ export class Provider {
227
237
  responseLog(response.text);
228
238
  if (response.usage) {
229
239
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
230
- input: response.usage.promptTokens || 0,
231
- output: response.usage.completionTokens || 0,
232
- total: response.usage.totalTokens || 0,
240
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
241
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
242
+ total: response.usage.totalTokens ?? 0,
243
+ cached: extractCachedTokens(response.usage),
233
244
  });
234
245
  }
235
246
  return response;
@@ -311,9 +322,10 @@ export class Provider {
311
322
  responseLog(response.text);
312
323
  if (response.usage) {
313
324
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
314
- input: response.usage.promptTokens || 0,
315
- output: response.usage.completionTokens || 0,
316
- total: response.usage.totalTokens || 0,
325
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
326
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
327
+ total: response.usage.totalTokens ?? 0,
328
+ cached: extractCachedTokens(response.usage),
317
329
  });
318
330
  }
319
331
  return response;
@@ -379,9 +391,10 @@ export class Provider {
379
391
  responseLog(response.object);
380
392
  if (response.usage) {
381
393
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
382
- input: response.usage.promptTokens || 0,
383
- output: response.usage.completionTokens || 0,
384
- total: response.usage.totalTokens || 0,
394
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
395
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
396
+ total: response.usage.totalTokens ?? 0,
397
+ cached: extractCachedTokens(response.usage),
385
398
  });
386
399
  }
387
400
  return response;
@@ -555,9 +568,9 @@ export class Provider {
555
568
  responseLog(response.text);
556
569
  if (response.usage) {
557
570
  Stats.recordTokens('vision', this.getModelName(this.config.visionModel), {
558
- input: response.usage.promptTokens || 0,
559
- output: response.usage.completionTokens || 0,
560
- total: response.usage.totalTokens || 0,
571
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
572
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
573
+ total: response.usage.totalTokens ?? 0,
561
574
  });
562
575
  }
563
576
  return response;
@@ -169,10 +169,10 @@ Focus on what would confuse a real user or caused the agent to make mistakes.`;
169
169
  const criticalViolations = report.axeViolations.filter((v) => v.impact === 'critical' || v.impact === 'serious');
170
170
  for (const v of criticalViolations.slice(0, 3)) {
171
171
  const nodeHtml = v.nodes[0]?.html.slice(0, 100) || '';
172
- task.addNote(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
172
+ task.addVerificationDetail(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
173
173
  }
174
174
  for (const issue of report.semanticIssues.slice(0, 3)) {
175
- task.addNote(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
175
+ task.addVerificationDetail(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
176
176
  }
177
177
  }
178
178
  saveReport(stateHash, report) {
@@ -1,20 +1,61 @@
1
- import { detectFocusArea } from "../../utils/aria.js";
2
1
  import { mdq } from "../../utils/markdown-query.js";
3
2
  export const FOCUSED_MARKER = '> **Focused**';
4
3
  const FOCUS_SKIP_SECTIONS = new Set(['navigation', 'menu']);
5
4
  export function hasFocusedSection(text) {
6
5
  return text.includes(FOCUSED_MARKER);
7
6
  }
8
- export function detectFocusFromAria(ariaSnapshot, sections) {
9
- const focusArea = detectFocusArea(ariaSnapshot);
10
- if (!focusArea.detected)
11
- return null;
12
- if (focusArea.type === 'dialog' || focusArea.type === 'modal') {
13
- const dialogSection = sections.find((s) => s.containerCss && (s.containerCss.includes('[role="dialog"]') || s.containerCss.includes('[role="alertdialog"]') || s.containerCss.includes('[aria-modal')));
14
- if (dialogSection)
15
- return dialogSection.name;
7
/**
 * Determines which research section is currently in focus by probing the live page.
 *
 * Every section that has a container selector, is not a navigation/menu section and
 * is visible gets probed for three signals: whether it is (or contains) a dialog,
 * the highest positioned z-index on its ancestor chain, and whether it casts a box shadow.
 * Dialog sections win over non-dialogs; within the pool the higher z-index wins, then
 * a section with a shadow; ties keep the earlier section.
 *
 * @param {object} page - Playwright-style page exposing `locator(css)`.
 * @param {Array<{name: string, containerCss?: string}>} sections - parsed research sections.
 * @returns {Promise<string | null>} name of the focused section, or null when nothing stands out.
 */
export async function detectFocusedSection(page, sections) {
    // Passed to locator.evaluate, so it only touches `el`, `document` and `window`.
    const probeElement = (el) => {
        const dialogSelector = '[role="dialog"], [role="alertdialog"], [aria-modal="true"]';
        const isDialog = el.matches(dialogSelector) || !!el.querySelector(dialogSelector);
        let maxZ = 0;
        for (let node = el; node && node !== document.body; node = node.parentElement) {
            const style = window.getComputedStyle(node);
            if (style.position === 'static') {
                continue;
            }
            const z = Number.parseInt(style.zIndex, 10);
            if (!Number.isNaN(z) && z > maxZ) {
                maxZ = z;
            }
        }
        const shadow = window.getComputedStyle(el).boxShadow;
        return { isDialog, zIndex: maxZ, hasShadow: !!shadow && shadow !== 'none' };
    };

    // True only when the challenger strictly beats the incumbent (ties keep the incumbent).
    const outranks = (challenger, incumbent) => {
        if (challenger.zIndex !== incumbent.zIndex) {
            return challenger.zIndex > incumbent.zIndex;
        }
        return challenger.hasShadow && !incumbent.hasShadow;
    };

    const candidates = [];
    for (const section of sections) {
        if (!section.containerCss) {
            continue;
        }
        const key = section.name.toLowerCase().replace(/^section:\s*/, '');
        if (FOCUS_SKIP_SECTIONS.has(key)) {
            continue;
        }
        try {
            const container = page.locator(section.containerCss).first();
            const visible = await container.isVisible();
            if (visible) {
                const signals = await container.evaluate(probeElement);
                candidates.push({ name: section.name, ...signals });
            }
        }
        catch {
            // Best-effort probe: a section that cannot be located or evaluated is simply not a candidate.
        }
    }
    if (candidates.length === 0) {
        return null;
    }

    const dialogs = candidates.filter((candidate) => candidate.isDialog);
    const pool = dialogs.length > 0 ? dialogs : candidates;
    let winner = null;
    for (const candidate of pool) {
        if (!winner || outranks(candidate, winner)) {
            winner = candidate;
        }
    }
    if (!winner) {
        return null;
    }
    // Without a dialog, a winner with no stacking and no shadow is indistinguishable from the rest.
    const nothingStandsOut = dialogs.length === 0 && winner.zIndex === 0 && !winner.hasShadow;
    return nothingStandsOut ? null : winner.name;
}
19
60
  export function markSectionAsFocused(result, sectionName) {
20
61
  if (hasFocusedSection(result.text))
@@ -3,6 +3,8 @@ import { executionController } from "../../execution-controller.js";
3
3
  import { tag } from '../../utils/logger.js';
4
4
  import { RulesLoader } from "../../utils/rules-loader.js";
5
5
  import { locatorRule as generalLocatorRuleText } from '../rules.js';
6
+ import { markSectionAsFocused } from "./focus.js";
7
+ import { ResearchResult } from "./research-result.js";
6
8
  export function WithSections(Base) {
7
9
  return class extends Base {
8
10
  async researchBySections() {
@@ -40,10 +42,12 @@ export function WithSections(Base) {
40
42
  if (parts.length === 0) {
41
43
  throw new Error('Per-section research produced no sections — AI responses all empty or NOT_PRESENT');
42
44
  }
43
- let merged = parts.join('\n\n');
44
- if (focusCss)
45
- merged += '\n\n> Focused: Focus';
46
- return merged;
45
+ const merged = parts.join('\n\n');
46
+ if (!focusCss)
47
+ return merged;
48
+ const focused = new ResearchResult(merged, this.actionResult?.url || '');
49
+ markSectionAsFocused(focused, 'Focus');
50
+ return focused.text;
47
51
  }
48
52
  async _detectFocusCss() {
49
53
  const focusSections = this.explorer.getConfig().ai?.agents?.researcher?.focusSections;
@@ -16,7 +16,7 @@ import { ContextLengthError } from './provider.js';
16
16
  import { findSimilarResearch, getCachedResearch, saveResearch } from "./researcher/cache.js";
17
17
  import { WithCoordinates } from "./researcher/coordinates.js";
18
18
  import { WithDeepAnalysis } from "./researcher/deep-analysis.js";
19
- import { detectFocusFromAria, hasFocusedSection, markSectionAsFocused, pickDefaultFocusedSection } from "./researcher/focus.js";
19
+ import { detectFocusedSection, hasFocusedSection, markSectionAsFocused, pickDefaultFocusedSection } from "./researcher/focus.js";
20
20
  import { WithLocators } from "./researcher/locators.js";
21
21
  import { extractValidContainers, formatResearchSummary, parseResearchSections } from "./researcher/parser.js";
22
22
  import { ResearchResult } from "./researcher/research-result.js";
@@ -186,18 +186,13 @@ export class Researcher extends ResearcherBase {
186
186
  if (!interrupted() && fix && result.locators.some((l) => l.valid === false)) {
187
187
  await this.fixBrokenSections(result, activeConversation);
188
188
  }
189
- // Focused section: parse AI declaration, then ARIA fallback
190
- const focusMatch = result.text.match(/^>\s*Focused:\s*(.+)/m);
191
- if (focusMatch) {
192
- result.text = result.text.replace(focusMatch[0], '');
193
- markSectionAsFocused(result, focusMatch[1].trim());
194
- }
195
- if (!hasFocusedSection(result.text)) {
189
+ // Focused section: unified Playwright probe (HTML+CSS+visibility).
190
+ // Must run BEFORE visuallyAnnotateContainers — annotation overlays inject z-index 99998+ which would pollute the scoring.
191
+ if (!interrupted() && this.hasScreenshotToAnalyze) {
196
192
  const sections = parseResearchSections(result.text);
197
- const ariaSnapshot = this.actionResult?.getCompactARIA() || '';
198
- const focusedName = detectFocusFromAria(ariaSnapshot, sections);
199
- if (focusedName)
200
- markSectionAsFocused(result, focusedName);
193
+ const focused = await detectFocusedSection(this.explorer.playwrightHelper.page, sections);
194
+ if (focused)
195
+ markSectionAsFocused(result, focused);
201
196
  }
202
197
  // Stage 4: Visual analysis
203
198
  if (!interrupted() && this.hasScreenshotToAnalyze) {
@@ -232,8 +227,8 @@ export class Researcher extends ResearcherBase {
232
227
  await this.backfillCoordinates(result);
233
228
  await this.backfillBrokenLocators(result);
234
229
  }
235
- // Focused section: final fallback
236
- if (!hasFocusedSection(result.text)) {
230
+ // Focused section: final fallback (vision-only — without a screenshot we don't infer focus)
231
+ if (this.hasScreenshotToAnalyze && !hasFocusedSection(result.text)) {
237
232
  const sections = parseResearchSections(result.text);
238
233
  const fallback = pickDefaultFocusedSection(sections);
239
234
  if (fallback)
@@ -388,16 +383,6 @@ export class Researcher extends ResearcherBase {
388
383
 
389
384
  | Element | ARIA | CSS | eidx |
390
385
  </section_format>
391
-
392
- <focused_section>
393
- At the end of your output, declare the primary focus area on a single line:
394
-
395
- > Focused: <exact section name>
396
-
397
- - If a dialog/modal/drawer/overlay exists, it is focused.
398
- - Otherwise pick the section where the main business action happens (list for catalog, detail for item page, content for article).
399
- - Navigation and menu/toolbar are never focused.
400
- </focused_section>
401
386
  `;
402
387
  }
403
388
  async buildResearchPrompt() {
@@ -231,6 +231,8 @@ export function multipleTabsRule(tabs) {
231
231
  }
232
232
  export const actionRule = dedent `
233
233
  <actions>
234
+ \`faker\` (from @faker-js/faker) is available inside I.* calls for generating data, e.g. I.fillField('Bio', faker.lorem.paragraphs(5)).
235
+
234
236
  ### I.click
235
237
 
236
238
  clicks on the element by its locator
@@ -13,68 +13,70 @@ export class SessionAnalyst {
13
13
  const eligible = tests.filter((t) => t.startTime != null);
14
14
  if (eligible.length === 0)
15
15
  return '';
16
- const model = this.provider.getModelForAgent('analyst');
16
+ const model = this.provider.getAgenticModel('analyst');
17
17
  const customPrompt = this.provider.getSystemPromptForAgent('analyst', undefined);
18
18
  const systemPrompt = dedent `
19
- You write a brief end-of-session report after autonomous exploratory testing. Your reader is a developer who needs to know in seconds: what is broken, how to reproduce it, and which results were inconclusive.
19
+ You write a TERSE end-of-session report. Reader is a developer who wants to UNDERSTAND THE FEATURE what works, what is broken, what is unclear. Every word must earn its place.
20
20
 
21
- Output MARKDOWN. No JSON, no preamble, no closing remarks. Start with the heading.
21
+ Output MARKDOWN. No JSON, no preamble, no closing summary.
22
22
 
23
- ## Clustering
24
- Group by ROOT CAUSE, not by scenario. If three tests fail for the same dropdown, that is ONE defect listing all three test refs (#3, #5, #7). Do not produce one cluster per test.
23
+ NO EMOJI. No 🔴 🟡 🟢 ✅, no escape sequences like \\u2705. Use plain text severity tags: [High], [Medium], [Low] for defects.
25
24
 
26
- ## Bucketing
27
- Use the FINAL verdict (the test's \`result\` field) as the starting point. Mid-test errors that the automation recovered from do NOT make a passed test unreliable.
25
+ ## Reporting unit
28
26
 
29
- - **Defect** real product bug. \`result: failed\` AND the failure reflects the app misbehaving (not the automation). The automation completed its interactions, the app contradicted the expected outcome. Severity required.
30
- - **UX issue** — app works but the UI is ambiguous, controls are hidden, or labels are unclear. Worth flagging to design.
31
- - **Execution issue** — the FINAL verdict is unreliable. Only two cases:
32
- 1. \`result: failed\` AND the failure was automation, environment, or UI/UX (locator missing, timeout, AI loop, navigation stuck, modal trapped focus, no accessible label) — i.e. the test could not conclude whether the app works.
33
- 2. \`result: passed\` AND clear evidence in the log shows the user-visible goal was NOT achieved (no confirmation visible, no state change verified, the assertion was vacuous).
27
+ Report at the level of FEATURES / FLOWS / PAGES. Tests are evidence, not the unit. Several tests covering the same flow ONE entry citing all of them.
34
28
 
35
- A test that passed and shows no contrary evidence belongs in NO section. Do not list passed tests just because the log contains intermediate retries or recovered failures.
29
+ ## Walk every test
36
30
 
37
- ## Severity emoji (defects only)
38
- - 🔴 critical or high — core flow blocked, data loss, security
39
- - 🟡 medium — partial breakage with workaround
40
- - 🟢 low — cosmetic
31
+ PASSED test: did all steps run, was the goal actually verified, did the user-visible goal happen? All yes → contributes to What works. Any no → Execution issue (false positive).
41
32
 
42
- ## Required format
33
+ FAILED test, first match wins: (1) goal achieved but mis-verified → Execution. (2) automation failure (locator/timeout/loop/modal/a11y) → Execution. (3) bad preconditions or data → Execution. (4) wrong URL/environment → Execution. (5) app contradicted expected outcome → Defect.
34
+
35
+ Crucial distinction: "the app misbehaved" vs "the automation could not interact with the app". ONLY the first is a Defect. If the automation gives up before the app responds — timeout, retries exhausted, dead loop / loop detected, could not click or find an element — that is an Execution issue regardless of what the log calls it. Failure inside the automation ≠ failure inside the product.
36
+
37
+ A solitary failure where adjacent tests on the same feature passed → Execution, not Defect.
38
+
39
+ ## Severity (defects only)
40
+ [High] blocks a core flow · [Medium] degrades a flow but workaround exists · [Low] cosmetic / edge case
41
+
42
+ ## Format
43
43
 
44
44
  # Session Analysis
45
45
 
46
- <one sentence: total tests, defect count, headline finding>
46
+ <ONE or TWO sentences describing the FEATURE STATE — what was explored, whether the core flow holds, what the standout problem is. NO test counts, NO "N tests run". Talk about the product, not the run.>
47
+
48
+ ## Coverage
49
+ - Pages: <paths>
50
+ - Features: <capabilities>
51
+
52
+ ## What works
53
+ - **<feature>** — #2, #7, #8
47
54
 
48
55
  ## Defects
49
56
 
50
- ### 🔴 <plain-English title of the BUG, not the scenario name>
51
- Affects: #3, #5, #7
57
+ ### [Medium] <plain-English bug title>
58
+ Affects: #3, #5
52
59
  Reproduce:
53
- 1. <concrete UI step a person can replay>
54
- 2. <next step>
55
- Evidence: <one short observation from the test log>
56
-
57
- ### 🟡 <next defect>
58
- ...
60
+ 1. <concrete UI step>
61
+ 2. <next>
62
+ Evidence: <one short observation>
59
63
 
60
64
  ## UX issues
61
-
62
- - **<title>** — #4
63
- <one short evidence line>
65
+ - **<feature>** — <what's confusing> (#7)
64
66
 
65
67
  ## Execution Issues
68
+ - **#2 <scenario>** — <≤10 words, what was unreliable>
66
69
 
67
- - **<short test name or scenario phrase>** — <plain-English one-liner: what made the result unreliable>
68
- - **<…>** — <…>
70
+ ## Brevity rules
69
71
 
70
- ## Rules
71
- - Defects first, sorted by severity descending. Omit any section that has zero entries.
72
- - Defect title describes the BUG ("Run-type dropdown does not filter"), never the scenario name.
73
- - Reproduce steps are concrete UI actions derived from the log: URL + clicks + inputs. Imperative, one short line each.
74
- - Evidence is the smallest factual observation from notes/steps that supports the claim — what was OBSERVED in the page (HTML, message, missing element). Never quote the test's \`result\` field as evidence; that is a tautology.
75
- - **Execution Issues** entries must explain what actually went wrong in concrete terms a human understands: "could not find a Submit button after navigation", "page reloaded before the assertion ran", "passed without ever seeing a confirmation message", "marked failed but the new item appears in the list", "modal trapped focus and tests could not click outside", "ARIA tree had no labelled controls". Avoid jargon like "locator failed" without context. Never write category prefixes ("execution:", "false-positive:") the section header already says it. No emoji on these entries.
76
- - Do NOT include a passed test in any section unless evidence proves its goal was not achieved. Intermediate retries or recovered errors in the log are not grounds for listing a passed test.
77
- - No editorialising, no restating the scenario verbatim, no closing summary.
72
+ - Headline: 2 sentences MAX. About the FEATURE, not the run. No counts, no "N tests", no "this session". Banned words: "exercised", "comprehensive", "notably", "this session", "module", "targeted", "covered creation".
73
+ - What works: feature name + test refs. NO parentheticals, NO caveats. If there's a caveat, the entry doesn't belong here.
74
+ - Defect title is the BUG ("Search returns non-matching results"), never the scenario name.
75
+ - Reproduce steps are imperative one-liners drawn from the log.
76
+ - Evidence is one short factual observation. Never quote the \`result\` field.
77
+ - Execution Issues: ONE line per test, ≤10 words, plain. Examples: "passed vacuously, no list assertion", "no file upload step in log", "dead loop on Save click". No prefixes, no nested explanation.
78
+ - Omit any empty section.
79
+ - Section order: Coverage What works Defects (severity desc) → UX issues → Execution Issues.
78
80
 
79
81
  ${customPrompt || ''}
80
82
  `;
@@ -87,7 +89,7 @@ export class SessionAnalyst {
87
89
  { role: 'system', content: systemPrompt },
88
90
  { role: 'user', content: userPayload },
89
91
  ], model, { agentName: 'analyst' });
90
- return (response?.text || '').trim();
92
+ return decodeEscapes((response?.text || '').trim());
91
93
  }
92
94
  writeReport(markdown) {
93
95
  const filePath = outputPath('reports', `${Stats.sessionLabel()}.md`);
@@ -115,3 +117,6 @@ export class SessionAnalyst {
115
117
  `;
116
118
  }
117
119
  }
120
/**
 * Converts literal Unicode escape sequences that appear as text (e.g. the six
 * characters `\u2705`, or `\u{1F534}`) into the characters they denote.
 *
 * Braced escapes are decoded first, then four-digit escapes. Escapes that do not
 * denote a valid code point (above U+10FFFF) are left in the text unchanged instead
 * of throwing, so a malformed sequence cannot abort the caller.
 *
 * @param {string} text - text possibly containing literal `\uXXXX` / `\u{X...}` sequences.
 * @returns {string} text with valid escape sequences replaced by their characters.
 */
function decodeEscapes(text) {
    const MAX_CODE_POINT = 0x10ffff;
    const toCharacter = (match, hex) => {
        const codePoint = Number.parseInt(hex, 16);
        // String.fromCodePoint throws RangeError outside the Unicode range.
        return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
    };
    return text
        .replace(/\\u\{([0-9a-fA-F]+)\}/g, toCharacter)
        .replace(/\\u([0-9a-fA-F]{4})/g, toCharacter);
}
@@ -49,6 +49,8 @@ export class Tester extends TaskAgent {
49
49
  pageStateHash = null;
50
50
  pageActionResult = null;
51
51
  hooksRunner;
52
+ seenUiMapUrls = new Set();
53
+ lastAnalyzedStateHash = null;
52
54
  constructor(explorer, provider, researcher, navigator, agentTools) {
53
55
  super();
54
56
  this.explorer = explorer;
@@ -80,7 +82,7 @@ export class Tester extends TaskAgent {
80
82
  return ActionResult.fromState(this.explorer.getStateManager().getCurrentState());
81
83
  }
82
84
  get progressCheckInterval() {
83
- return this.explorer.getConfig().ai?.agents?.tester?.progressCheckInterval ?? 5;
85
+ return this.explorer.getConfig().ai?.agents?.tester?.progressCheckInterval ?? 3;
84
86
  }
85
87
  getConversation() {
86
88
  return this.currentConversation;
@@ -96,6 +98,8 @@ export class Tester extends TaskAgent {
96
98
  this.previousStateHash = null;
97
99
  this.pageStateHash = null;
98
100
  this.pageActionResult = null;
101
+ this.seenUiMapUrls.clear();
102
+ this.lastAnalyzedStateHash = null;
99
103
  this.explorer.getStateManager().clearHistory();
100
104
  this.resetFailureCount();
101
105
  this.pilot?.reset();
@@ -117,12 +121,18 @@ export class Tester extends TaskAgent {
117
121
  page?.on('console', onConsoleMessage);
118
122
  const initialState = ActionResult.fromState(state);
119
123
  const conversation = this.provider.startConversation(this.getSystemMessage(), 'tester');
124
+ conversation.markLastMessageCacheable();
120
125
  this.currentConversation = conversation;
121
126
  const outputDir = ConfigParser.getInstance().getOutputDir();
122
127
  this.executionLogFile = join(outputDir, `tester_${task.sessionName}.md`);
123
128
  // Note: Markdown saving functionality removed from Conversation class
124
- const initialPrompt = await this.buildTestPrompt(task, initialState);
125
- conversation.addUserText(initialPrompt);
129
+ const scenarioBlock = this.buildScenarioBlock(task, initialState);
130
+ conversation.addUserText(scenarioBlock);
131
+ conversation.markLastMessageCacheable();
132
+ conversation.protectPrefix(conversation.messages.length);
133
+ const pageContext = await this.reinjectContextIfNeeded(1, initialState);
134
+ if (pageContext)
135
+ conversation.addUserText(pageContext);
126
136
  return await Observability.run(`test: ${task.scenario}`, {
127
137
  sessionId: task.sessionName,
128
138
  tags: ['tester'],
@@ -138,6 +148,12 @@ export class Tester extends TaskAgent {
138
148
  if (this.pilot) {
139
149
  try {
140
150
  const plan = await this.pilot.planTest(task, initialState);
151
+ if (task.hasFinished) {
152
+ offFailedRequest?.();
153
+ page?.off('pageerror', onPageError);
154
+ page?.off('console', onConsoleMessage);
155
+ return { success: task.isSuccessful };
156
+ }
141
157
  if (plan) {
142
158
  conversation.addUserText(`Pilot's test plan:\n${plan}\n\nFollow this plan while executing the test.`);
143
159
  }
@@ -158,14 +174,18 @@ export class Tester extends TaskAgent {
158
174
  await this.explorer.startTest(task);
159
175
  debugLog(`Navigating to ${task.startUrl}`);
160
176
  await this.explorer.visit(task.startUrl);
161
- const currentUrl = this.explorer.getStateManager().getCurrentState()?.url || task.startUrl || '';
177
+ const startState = this.explorer.getStateManager().getCurrentState();
178
+ if (startState)
179
+ task.addUrlNote(startState);
180
+ const currentUrl = startState?.url || task.startUrl || '';
162
181
  await this.hooksRunner.runBeforeHook('tester', currentUrl);
163
182
  const offStateChange = this.explorer.getStateManager().onStateChange((event) => {
164
183
  if (task.hasFinished)
165
184
  return;
166
185
  if (event.toState?.url === event.fromState?.url)
167
186
  return;
168
- task.addNote(`Navigated to ${event.toState?.url}`, TestResult.PASSED);
187
+ if (event.toState)
188
+ task.addUrlNote(event.toState, event.fromState || undefined);
169
189
  task.states.push(event.toState);
170
190
  });
171
191
  const codeceptjsTools = createCodeceptJSTools(this.explorer, task);
@@ -203,13 +223,13 @@ export class Tester extends TaskAgent {
203
223
  The user has interrupted and wants to change direction. Follow the new instruction.
204
224
  `);
205
225
  }
206
- conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 2);
226
+ conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 1);
207
227
  conversation.cleanupTag('page_html', '...cleaned HTML snapshot...', 1);
208
228
  conversation.cleanupTag('experience', '...cleaned experience...', 1);
209
229
  conversation.cleanupTag('applied_experience', '...cleaned past experience...', 1);
210
230
  conversation.cleanupTag('page_ui_map', '...cleaned UI map...', 1);
211
231
  conversation.cleanupTag('page_ui_map_overlay', '...cleaned UI overlay...', 1);
212
- conversation.compactToolResults(3);
232
+ conversation.compactToolResults(2);
213
233
  if (iteration > 1) {
214
234
  const isNewPage = this.previousUrl !== null && this.previousUrl !== currentState.url;
215
235
  let nextStep = '';
@@ -220,16 +240,17 @@ export class Tester extends TaskAgent {
220
240
  if (guidance)
221
241
  nextStep += `\n\n${guidance}`;
222
242
  }
223
- else if ((iteration % this.progressCheckInterval === 0 || this.consecutiveFailures >= 3 || this.consecutiveEmptyResults >= 2) && this.pilot) {
243
+ else if (this.shouldAnalyzeProgress(iteration, currentState) && this.pilot) {
224
244
  const guidance = await this.pilot.analyzeProgress(task, currentState, conversation);
225
245
  if (guidance)
226
246
  nextStep += `\n\n${guidance}`;
227
247
  this.consecutiveFailures = 0;
248
+ this.lastAnalyzedStateHash = currentState.hash;
228
249
  }
229
250
  conversation.addUserText(nextStep);
230
251
  }
231
252
  const result = await this.provider.invokeConversation(conversation, tools, {
232
- maxToolRoundtrips: 5,
253
+ maxToolRoundtrips: 3,
233
254
  toolChoice: 'required',
234
255
  stopWhen: () => task.hasFinished,
235
256
  });
@@ -308,10 +329,15 @@ export class Tester extends TaskAgent {
308
329
  : undefined,
309
330
  catch: async ({ error, stop }) => {
310
331
  tag('error').log(`Test execution error: ${error}`);
332
+ const message = error instanceof Error ? error.message : String(error);
311
333
  if (!task.hasFinished) {
312
- task.addNote(`Execution error: ${error instanceof Error ? error.message : String(error)}`);
334
+ task.addNote(`Execution error: ${message}`);
313
335
  }
314
- stop();
336
+ if (error instanceof Error && error.name === 'AbortError') {
337
+ stop();
338
+ return;
339
+ }
340
+ conversation.addUserText(`Previous AI call failed: ${message}. Take a different approach on the next step.`);
315
341
  },
316
342
  });
317
343
  if (task.hasFinished)
@@ -354,6 +380,17 @@ export class Tester extends TaskAgent {
354
380
  ...task,
355
381
  };
356
382
  }
383
+ shouldAnalyzeProgress(iteration, currentState) {
384
+ if (this.consecutiveFailures >= 3)
385
+ return true;
386
+ if (this.consecutiveEmptyResults >= 2)
387
+ return true;
388
+ if (iteration % this.progressCheckInterval !== 0)
389
+ return false;
390
+ if (this.lastAnalyzedStateHash === currentState.hash)
391
+ return false;
392
+ return true;
393
+ }
357
394
  async prepareInstructionsForNextStep(task) {
358
395
  let outcomeStatus = dedent `
359
396
  <task>
@@ -432,19 +469,23 @@ export class Tester extends TaskAgent {
432
469
  this.explorer.clearOtherTabsInfo();
433
470
  }
434
471
  if (isNewUrl) {
472
+ const alreadySeenUiMap = this.seenUiMapUrls.has(currentUrl);
435
473
  let research = '';
436
- try {
437
- research = await this.researcher.research(currentState);
438
- }
439
- catch (err) {
440
- if (!(err instanceof ErrorPageError))
441
- throw err;
442
- tag('warning').log(`Research skipped: ${err.message}`);
474
+ if (!alreadySeenUiMap) {
475
+ try {
476
+ research = await this.researcher.research(currentState);
477
+ }
478
+ catch (err) {
479
+ if (!(err instanceof ErrorPageError))
480
+ throw err;
481
+ tag('warning').log(`Research skipped: ${err.message}`);
482
+ }
443
483
  }
444
484
  this.pageStateHash = currentStateHash;
445
485
  this.pageActionResult = currentState;
446
486
  let uiMapSection = '';
447
487
  if (research) {
488
+ this.seenUiMapUrls.add(currentUrl);
448
489
  uiMapSection = dedent `
449
490
 
450
491
  Page UI Map
@@ -454,6 +495,9 @@ export class Tester extends TaskAgent {
454
495
  </page_ui_map>
455
496
  `;
456
497
  }
498
+ else if (alreadySeenUiMap) {
499
+ uiMapSection = `\n\n<page_ui_map>UI map for ${currentUrl} was shown earlier in this session — refer to it above.</page_ui_map>`;
500
+ }
457
501
  context += dedent `
458
502
  Context:
459
503
 
@@ -651,9 +695,8 @@ export class Tester extends TaskAgent {
651
695
  ${this.provider.getSystemPromptForAgent('tester', this.explorer.getStateManager().getCurrentState()?.url) || ''}
652
696
  `;
653
697
  }
654
- async buildTestPrompt(task, actionResult) {
698
+ buildScenarioBlock(task, actionResult) {
655
699
  const knowledge = this.getKnowledge(actionResult);
656
- const pageContext = await this.reinjectContextIfNeeded(1, actionResult);
657
700
  return dedent `
658
701
  <task>
659
702
  SCENARIO GOAL: ${task.scenario}
@@ -680,8 +723,6 @@ export class Tester extends TaskAgent {
680
723
  ${this.buildAvailableFiles()}
681
724
 
682
725
  ${knowledge}
683
-
684
- ${pageContext}
685
726
  `;
686
727
  }
687
728
  getDeletableSessionNames(task) {