npm - explorbot - Versions diffs - 0.1.13 → 0.1.16 - Mend

explorbot 0.1.13 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

package/dist/package.json +3 -2
package/dist/src/action.js +3 -2
package/dist/src/ai/conversation.js +20 -4
package/dist/src/ai/historian/utils.js +8 -1
package/dist/src/ai/pilot.js +198 -260
package/dist/src/ai/provider.js +25 -12
package/dist/src/ai/quartermaster.js +2 -2
package/dist/src/ai/researcher/focus.js +51 -10
package/dist/src/ai/researcher/sections.js +8 -4
package/dist/src/ai/researcher.js +9 -24
package/dist/src/ai/rules.js +2 -0
package/dist/src/ai/session-analyst.js +46 -41
package/dist/src/ai/tester.js +63 -22
package/dist/src/ai/tools.js +19 -4
package/dist/src/commands/explore-command.js +8 -2
package/dist/src/components/StatusPane.js +6 -1
package/dist/src/experience-tracker.js +9 -0
package/dist/src/explorer.js +2 -5
package/dist/src/reporter.js +41 -1
package/dist/src/stats.js +2 -1
package/dist/src/test-plan.js +47 -3
package/package.json +3 -2
package/src/action.ts +3 -2
package/src/ai/conversation.ts +21 -4
package/src/ai/historian/utils.ts +8 -1
package/src/ai/pilot.ts +199 -259
package/src/ai/provider.ts +24 -12
package/src/ai/quartermaster.ts +2 -2
package/src/ai/researcher/focus.ts +57 -8
package/src/ai/researcher/sections.ts +7 -3
package/src/ai/researcher.ts +8 -23
package/src/ai/rules.ts +2 -0
package/src/ai/session-analyst.ts +47 -41
package/src/ai/tester.ts +55 -20
package/src/ai/tools.ts +18 -4
package/src/commands/explore-command.ts +9 -2
package/src/components/StatusPane.tsx +6 -3
package/src/experience-tracker.ts +9 -0
package/src/explorer.ts +1 -4
package/src/reporter.ts +44 -1
package/src/stats.ts +3 -1
package/src/test-plan.ts +62 -3

package/src/ai/provider.ts CHANGED

@@ -19,6 +19,15 @@ const responseLog = createDebug('explorbot:provider:in');
 class AiError extends Error {}
 export class ContextLengthError extends Error {}
+function extractCachedTokens(usage: any): number {
+  if (!usage) return 0;
+  const direct = usage.cachedInputTokens ?? usage.inputTokenDetails?.cacheReadTokens;
+  if (typeof direct === 'number') return direct;
+  const raw = usage.raw;
+  const fromRaw = raw?.prompt_tokens_details?.cached_tokens ?? raw?.promptTokensDetails?.cachedTokens;
+  return typeof fromRaw === 'number' ? fromRaw : 0;
+}
 function rejectAfterIdle(ms: number, signal: { cancelled: boolean }): Promise<never> {
   return new Promise((_, reject) => {
     const tick = () => {
@@ -265,9 +274,10 @@ export class Provider {
       if (response.usage) {
         Stats.recordTokens(options.agentName || 'unknown', modelName, {
-          input: response.usage.promptTokens || 0,
-          output: response.usage.completionTokens || 0,
-          total: response.usage.totalTokens || 0,
+          input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
+          output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
+          total: response.usage.totalTokens ?? 0,
+          cached: extractCachedTokens(response.usage),
         });
       }
@@ -355,9 +365,10 @@ export class Provider {
       if (response.usage) {
         Stats.recordTokens(options.agentName || 'unknown', modelName, {
-          input: response.usage.promptTokens || 0,
-          output: response.usage.completionTokens || 0,
-          total: response.usage.totalTokens || 0,
+          input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
+          output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
+          total: response.usage.totalTokens ?? 0,
+          cached: extractCachedTokens(response.usage),
         });
       }
@@ -428,9 +439,10 @@ export class Provider {
       if (response.usage) {
         Stats.recordTokens(options.agentName || 'unknown', modelName, {
-          input: response.usage.promptTokens || 0,
-          output: response.usage.completionTokens || 0,
-          total: response.usage.totalTokens || 0,
+          input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
+          output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
+          total: response.usage.totalTokens ?? 0,
+          cached: extractCachedTokens(response.usage),
         });
       }
@@ -625,9 +637,9 @@ export class Provider {
       if (response.usage) {
         Stats.recordTokens('vision', this.getModelName(this.config.visionModel), {
-          input: response.usage.promptTokens || 0,
-          output: response.usage.completionTokens || 0,
-          total: response.usage.totalTokens || 0,
+          input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
+          output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
+          total: response.usage.totalTokens ?? 0,
         });
       }

package/src/ai/quartermaster.ts CHANGED

@@ -240,11 +240,11 @@ Focus on what would confuse a real user or caused the agent to make mistakes.`;
     const criticalViolations = report.axeViolations.filter((v) => v.impact === 'critical' || v.impact === 'serious');
     for (const v of criticalViolations.slice(0, 3)) {
       const nodeHtml = v.nodes[0]?.html.slice(0, 100) || '';
-      task.addNote(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
+      task.addVerificationDetail(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
     }
     for (const issue of report.semanticIssues.slice(0, 3)) {
-      task.addNote(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
+      task.addVerificationDetail(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
     }
   }

package/src/ai/researcher/focus.ts CHANGED

@@ -1,4 +1,4 @@
-import { detectFocusArea } from '../../utils/aria.ts';
+import type { Page } from 'playwright';
 import { mdq } from '../../utils/markdown-query.ts';
 import type { ResearchSection } from './parser.ts';
 import type { ResearchResult } from './research-result.ts';
@@ -10,16 +10,65 @@ export function hasFocusedSection(text: string): boolean {
   return text.includes(FOCUSED_MARKER);
 }
-export function detectFocusFromAria(ariaSnapshot: string | null, sections: ResearchSection[]): string | null {
-  const focusArea = detectFocusArea(ariaSnapshot);
-  if (!focusArea.detected) return null;
+interface FocusProbe {
+  name: string;
+  isDialog: boolean;
+  zIndex: number;
+  hasShadow: boolean;
+}
+export async function detectFocusedSection(page: Page, sections: ResearchSection[]): Promise<string | null> {
+  const candidates: FocusProbe[] = [];
+  for (const section of sections) {
+    if (!section.containerCss) continue;
+    const key = section.name.toLowerCase().replace(/^section:\s*/, '');
+    if (FOCUS_SKIP_SECTIONS.has(key)) continue;
+    try {
+      const locator = page.locator(section.containerCss).first();
+      if (!(await locator.isVisible())) continue;
+      const probe = await locator.evaluate((el) => {
+        const dialogSelector = '[role="dialog"], [role="alertdialog"], [aria-modal="true"]';
+        const isDialog = el.matches(dialogSelector) || !!el.querySelector(dialogSelector);
+        let cur: Element | null = el;
+        let maxZ = 0;
+        while (cur && cur !== document.body) {
+          const cs = window.getComputedStyle(cur);
+          if (cs.position !== 'static') {
+            const z = Number.parseInt(cs.zIndex, 10);
+            if (!Number.isNaN(z) && z > maxZ) maxZ = z;
+          }
+          cur = cur.parentElement;
+        }
-  if (focusArea.type === 'dialog' || focusArea.type === 'modal') {
-    const dialogSection = sections.find((s) => s.containerCss && (s.containerCss.includes('[role="dialog"]') || s.containerCss.includes('[role="alertdialog"]') || s.containerCss.includes('[aria-modal')));
-    if (dialogSection) return dialogSection.name;
+        const shadow = window.getComputedStyle(el).boxShadow;
+        const hasShadow = !!shadow && shadow !== 'none';
+        return { isDialog, zIndex: maxZ, hasShadow };
+      });
+      candidates.push({ name: section.name, ...probe });
+    } catch {}
   }
-  return null;
+  if (candidates.length === 0) return null;
+  const dialogs = candidates.filter((c) => c.isDialog);
+  const pool = dialogs.length > 0 ? dialogs : candidates;
+  const winner = pool.reduce<FocusProbe | null>((best, c) => {
+    if (!best) return c;
+    if (c.zIndex !== best.zIndex) return c.zIndex > best.zIndex ? c : best;
+    if (c.hasShadow !== best.hasShadow) return c.hasShadow ? c : best;
+    return best;
+  }, null);
+  if (!winner) return null;
+  if (dialogs.length === 0 && winner.zIndex === 0 && !winner.hasShadow) return null;
+  return winner.name;
 }
 export function markSectionAsFocused(result: ResearchResult, sectionName: string): void {

package/src/ai/researcher/sections.ts CHANGED

@@ -7,7 +7,9 @@ import { tag } from '../../utils/logger.js';
 import { RulesLoader } from '../../utils/rules-loader.ts';
 import type { Provider } from '../provider.js';
 import { locatorRule as generalLocatorRuleText } from '../rules.js';
+import { markSectionAsFocused } from './focus.ts';
 import type { Constructor } from './mixin.ts';
+import { ResearchResult } from './research-result.ts';
 export interface SectionMethods {
   researchBySections(): Promise<string>;
@@ -54,9 +56,11 @@ export function WithSections<T extends Constructor>(Base: T) {
         throw new Error('Per-section research produced no sections — AI responses all empty or NOT_PRESENT');
       }
-      let merged = parts.join('\n\n');
-      if (focusCss) merged += '\n\n> Focused: Focus';
-      return merged;
+      const merged = parts.join('\n\n');
+      if (!focusCss) return merged;
+      const focused = new ResearchResult(merged, this.actionResult?.url || '');
+      markSectionAsFocused(focused, 'Focus');
+      return focused.text;
     }
     private async _detectFocusCss(): Promise<string | null> {

package/src/ai/researcher.ts CHANGED

@@ -24,7 +24,7 @@ import { ContextLengthError, type Provider } from './provider.js';
 import { findSimilarResearch, getCachedResearch, saveResearch } from './researcher/cache.ts';
 import { type CoordinateMethods, WithCoordinates } from './researcher/coordinates.ts';
 import { type DeepAnalysisMethods, WithDeepAnalysis } from './researcher/deep-analysis.ts';
-import { detectFocusFromAria, hasFocusedSection, markSectionAsFocused, pickDefaultFocusedSection } from './researcher/focus.ts';
+import { detectFocusedSection, hasFocusedSection, markSectionAsFocused, pickDefaultFocusedSection } from './researcher/focus.ts';
 import { type LocatorMethods, WithLocators } from './researcher/locators.ts';
 import { extractValidContainers, formatResearchSummary, parseResearchSections } from './researcher/parser.ts';
 import { ResearchResult } from './researcher/research-result.ts';
@@ -234,17 +234,12 @@ export class Researcher extends ResearcherBase implements Agent {
         await this.fixBrokenSections(result, activeConversation);
       }
-      // Focused section: parse AI declaration, then ARIA fallback
-      const focusMatch = result.text.match(/^>\s*Focused:\s*(.+)/m);
-      if (focusMatch) {
-        result.text = result.text.replace(focusMatch[0], '');
-        markSectionAsFocused(result, focusMatch[1].trim());
-      }
-      if (!hasFocusedSection(result.text)) {
+      // Focused section: unified Playwright probe (HTML+CSS+visibility).
+      // Must run BEFORE visuallyAnnotateContainers — annotation overlays inject z-index 99998+ which would pollute the scoring.
+      if (!interrupted() && this.hasScreenshotToAnalyze) {
         const sections = parseResearchSections(result.text);
-        const ariaSnapshot = this.actionResult?.getCompactARIA() || '';
-        const focusedName = detectFocusFromAria(ariaSnapshot, sections);
-        if (focusedName) markSectionAsFocused(result, focusedName);
+        const focused = await detectFocusedSection(this.explorer.playwrightHelper.page, sections);
+        if (focused) markSectionAsFocused(result, focused);
       }
       // Stage 4: Visual analysis
@@ -281,8 +276,8 @@ export class Researcher extends ResearcherBase implements Agent {
         await this.backfillBrokenLocators(result);
       }
-      // Focused section: final fallback
-      if (!hasFocusedSection(result.text)) {
+      // Focused section: final fallback (vision-only — without a screenshot we don't infer focus)
+      if (this.hasScreenshotToAnalyze && !hasFocusedSection(result.text)) {
         const sections = parseResearchSections(result.text);
         const fallback = pickDefaultFocusedSection(sections);
         if (fallback) markSectionAsFocused(result, fallback);
@@ -451,16 +446,6 @@ export class Researcher extends ResearcherBase implements Agent {
       | Element | ARIA | CSS | eidx |
       </section_format>
-      <focused_section>
-      At the end of your output, declare the primary focus area on a single line:
-      > Focused: <exact section name>
-      - If a dialog/modal/drawer/overlay exists, it is focused.
-      - Otherwise pick the section where the main business action happens (list for catalog, detail for item page, content for article).
-      - Navigation and menu/toolbar are never focused.
-      </focused_section>
     `;
   }

package/src/ai/rules.ts CHANGED

@@ -241,6 +241,8 @@ export function multipleTabsRule(tabs: Array<{ url: string; title: string }>): s
 export const actionRule = dedent`
   <actions>
+  \`faker\` (from @faker-js/faker) is available inside I.* calls for generating data, e.g. I.fillField('Bio', faker.lorem.paragraphs(5)).
   ### I.click
   clicks on the element by its locator

package/src/ai/session-analyst.ts CHANGED

@@ -19,69 +19,71 @@ export class SessionAnalyst implements Agent {
     const eligible = tests.filter((t) => t.startTime != null);
     if (eligible.length === 0) return '';
-    const model = this.provider.getModelForAgent('analyst');
+    const model = this.provider.getAgenticModel('analyst');
     const customPrompt = this.provider.getSystemPromptForAgent('analyst', undefined);
     const systemPrompt = dedent`
-      You write a brief end-of-session report after autonomous exploratory testing. Your reader is a developer who needs to know in seconds: what is broken, how to reproduce it, and which results were inconclusive.
+      You write a TERSE end-of-session report. Reader is a developer who wants to UNDERSTAND THE FEATURE — what works, what is broken, what is unclear. Every word must earn its place.
-      Output MARKDOWN. No JSON, no preamble, no closing remarks. Start with the heading.
+      Output MARKDOWN. No JSON, no preamble, no closing summary.
-      ## Clustering
-      Group by ROOT CAUSE, not by scenario. If three tests fail for the same dropdown, that is ONE defect listing all three test refs (#3, #5, #7). Do not produce one cluster per test.
+      NO EMOJI. No 🔴 🟡 🟢 ✅, no escape sequences like \\u2705. Use plain text severity tags: [High], [Medium], [Low] for defects.
-      ## Bucketing
-      Use the FINAL verdict (the test's \`result\` field) as the starting point. Mid-test errors that the automation recovered from do NOT make a passed test unreliable.
+      ## Reporting unit
-      - **Defect** — real product bug. \`result: failed\` AND the failure reflects the app misbehaving (not the automation). The automation completed its interactions, the app contradicted the expected outcome. Severity required.
-      - **UX issue** — app works but the UI is ambiguous, controls are hidden, or labels are unclear. Worth flagging to design.
-      - **Execution issue** — the FINAL verdict is unreliable. Only two cases:
-        1. \`result: failed\` AND the failure was automation, environment, or UI/UX (locator missing, timeout, AI loop, navigation stuck, modal trapped focus, no accessible label) — i.e. the test could not conclude whether the app works.
-        2. \`result: passed\` AND clear evidence in the log shows the user-visible goal was NOT achieved (no confirmation visible, no state change verified, the assertion was vacuous).
+      Report at the level of FEATURES / FLOWS / PAGES. Tests are evidence, not the unit. Several tests covering the same flow → ONE entry citing all of them.
-      A test that passed and shows no contrary evidence belongs in NO section. Do not list passed tests just because the log contains intermediate retries or recovered failures.
+      ## Walk every test
-      ## Severity emoji (defects only)
-      - 🔴 critical or high — core flow blocked, data loss, security
-      - 🟡 medium — partial breakage with workaround
-      - 🟢 low — cosmetic
+      PASSED test: did all steps run, was the goal actually verified, did the user-visible goal happen? All yes → contributes to What works. Any no → Execution issue (false positive).
-      ## Required format
+      FAILED test, first match wins: (1) goal achieved but mis-verified → Execution. (2) automation failure (locator/timeout/loop/modal/a11y) → Execution. (3) bad preconditions or data → Execution. (4) wrong URL/environment → Execution. (5) app contradicted expected outcome → Defect.
+      Crucial distinction: "the app misbehaved" vs "the automation could not interact with the app". ONLY the first is a Defect. If the automation gives up before the app responds — timeout, retries exhausted, dead loop / loop detected, could not click or find an element — that is an Execution issue regardless of what the log calls it. Failure inside the automation ≠ failure inside the product.
+      A solitary failure where adjacent tests on the same feature passed → Execution, not Defect.
+      ## Severity (defects only)
+      [High] blocks a core flow · [Medium] degrades a flow but workaround exists · [Low] cosmetic / edge case
+      ## Format
       # Session Analysis
-      <one sentence: total tests, defect count, headline finding>
+      <ONE or TWO sentences describing the FEATURE STATE — what was explored, whether the core flow holds, what the standout problem is. NO test counts, NO "N tests run". Talk about the product, not the run.>
+      ## Coverage
+      - Pages: <paths>
+      - Features: <capabilities>
+      ## What works
+      - **<feature>** — #2, #7, #8
       ## Defects
-      ### 🔴 <plain-English title of the BUG, not the scenario name>
-      Affects: #3, #5, #7
+      ### [Medium] <plain-English bug title>
+      Affects: #3, #5
       Reproduce:
-        1. <concrete UI step a person can replay>
-        2. <next step>
-      Evidence: <one short observation from the test log>
-      ### 🟡 <next defect>
-      ...
+        1. <concrete UI step>
+        2. <next>
+      Evidence: <one short observation>
       ## UX issues
-      - **<title>** — #4
-        <one short evidence line>
+      - **<feature>** — <what's confusing> (#7)
       ## Execution Issues
+      - **#2 <scenario>** — <≤10 words, what was unreliable>
-      - **<short test name or scenario phrase>** — <plain-English one-liner: what made the result unreliable>
-      - **<…>** — <…>
+      ## Brevity rules
-      ## Rules
-      - Defects first, sorted by severity descending. Omit any section that has zero entries.
-      - Defect title describes the BUG ("Run-type dropdown does not filter"), never the scenario name.
-      - Reproduce steps are concrete UI actions derived from the log: URL + clicks + inputs. Imperative, one short line each.
-      - Evidence is the smallest factual observation from notes/steps that supports the claim — what was OBSERVED in the page (HTML, message, missing element). Never quote the test's \`result\` field as evidence; that is a tautology.
-      - **Execution Issues** entries must explain what actually went wrong in concrete terms a human understands: "could not find a Submit button after navigation", "page reloaded before the assertion ran", "passed without ever seeing a confirmation message", "marked failed but the new item appears in the list", "modal trapped focus and tests could not click outside", "ARIA tree had no labelled controls". Avoid jargon like "locator failed" without context. Never write category prefixes ("execution:", "false-positive:") — the section header already says it. No emoji on these entries.
-      - Do NOT include a passed test in any section unless evidence proves its goal was not achieved. Intermediate retries or recovered errors in the log are not grounds for listing a passed test.
-      - No editorialising, no restating the scenario verbatim, no closing summary.
+      - Headline: 2 sentences MAX. About the FEATURE, not the run. No counts, no "N tests", no "this session". Banned words: "exercised", "comprehensive", "notably", "this session", "module", "targeted", "covered creation".
+      - What works: feature name + test refs. NO parentheticals, NO caveats. If there's a caveat, the entry doesn't belong here.
+      - Defect title is the BUG ("Search returns non-matching results"), never the scenario name.
+      - Reproduce steps are imperative one-liners drawn from the log.
+      - Evidence is one short factual observation. Never quote the \`result\` field.
+      - Execution Issues: ONE line per test, ≤10 words, plain. Examples: "passed vacuously, no list assertion", "no file upload step in log", "dead loop on Save click". No prefixes, no nested explanation.
+      - Omit any empty section.
+      - Section order: Coverage → What works → Defects (severity desc) → UX issues → Execution Issues.
       ${customPrompt || ''}
     `;
@@ -101,7 +103,7 @@ export class SessionAnalyst implements Agent {
       { agentName: 'analyst' }
     );
-    return (response?.text || '').trim();
+    return decodeEscapes((response?.text || '').trim());
   }
   writeReport(markdown: string): string {
@@ -131,3 +133,7 @@ export class SessionAnalyst implements Agent {
     `;
   }
 }
+function decodeEscapes(text: string): string {
+  return text.replace(/\\u\{([0-9a-fA-F]+)\}/g, (_, hex) => String.fromCodePoint(Number.parseInt(hex, 16))).replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) => String.fromCodePoint(Number.parseInt(hex, 16)));
+}

package/src/ai/tester.ts CHANGED

@@ -64,6 +64,8 @@ export class Tester extends TaskAgent implements Agent {
   private pageStateHash: string | null = null;
   private pageActionResult: ActionResult | null = null;
   private hooksRunner: HooksRunner;
+  private seenUiMapUrls = new Set<string>();
+  private lastAnalyzedStateHash: string | null = null;
   constructor(explorer: Explorer, provider: Provider, researcher: Researcher, navigator: Navigator, agentTools?: any) {
     super();
@@ -104,7 +106,7 @@ export class Tester extends TaskAgent implements Agent {
   }
   private get progressCheckInterval(): number {
-    return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ?? 5;
+    return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ?? 3;
   }
   getConversation(): Conversation | null {
@@ -123,6 +125,8 @@ export class Tester extends TaskAgent implements Agent {
     this.previousStateHash = null;
     this.pageStateHash = null;
     this.pageActionResult = null;
+    this.seenUiMapUrls.clear();
+    this.lastAnalyzedStateHash = null;
     this.explorer.getStateManager().clearHistory();
     this.resetFailureCount();
     this.pilot?.reset();
@@ -147,14 +151,20 @@ export class Tester extends TaskAgent implements Agent {
     const initialState = ActionResult.fromState(state);
     const conversation = this.provider.startConversation(this.getSystemMessage(), 'tester');
+    conversation.markLastMessageCacheable();
     this.currentConversation = conversation;
     const outputDir = ConfigParser.getInstance().getOutputDir();
     this.executionLogFile = join(outputDir, `tester_${task.sessionName}.md`);
     // Note: Markdown saving functionality removed from Conversation class
-    const initialPrompt = await this.buildTestPrompt(task, initialState);
-    conversation.addUserText(initialPrompt);
+    const scenarioBlock = this.buildScenarioBlock(task, initialState);
+    conversation.addUserText(scenarioBlock);
+    conversation.markLastMessageCacheable();
+    conversation.protectPrefix(conversation.messages.length);
+    const pageContext = await this.reinjectContextIfNeeded(1, initialState);
+    if (pageContext) conversation.addUserText(pageContext);
     return await Observability.run(
       `test: ${task.scenario}`,
@@ -177,6 +187,12 @@ export class Tester extends TaskAgent implements Agent {
     if (this.pilot) {
       try {
         const plan = await this.pilot.planTest(task, initialState);
+        if (task.hasFinished) {
+          offFailedRequest?.();
+          page?.off('pageerror', onPageError);
+          page?.off('console', onConsoleMessage);
+          return { success: task.isSuccessful };
+        }
         if (plan) {
           conversation.addUserText(`Pilot's test plan:\n${plan}\n\nFollow this plan while executing the test.`);
         }
@@ -200,13 +216,15 @@ export class Tester extends TaskAgent implements Agent {
     debugLog(`Navigating to ${task.startUrl}`);
     await this.explorer.visit(task.startUrl!);
-    const currentUrl = this.explorer.getStateManager().getCurrentState()?.url || task.startUrl || '';
+    const startState = this.explorer.getStateManager().getCurrentState();
+    if (startState) task.addUrlNote(startState);
+    const currentUrl = startState?.url || task.startUrl || '';
     await this.hooksRunner.runBeforeHook('tester', currentUrl);
     const offStateChange = this.explorer.getStateManager().onStateChange((event: StateTransition) => {
       if (task.hasFinished) return;
       if (event.toState?.url === event.fromState?.url) return;
-      task.addNote(`Navigated to ${event.toState?.url}`, TestResult.PASSED);
+      if (event.toState) task.addUrlNote(event.toState, event.fromState || undefined);
       task.states.push(event.toState);
     });
@@ -253,13 +271,13 @@ export class Tester extends TaskAgent implements Agent {
           `);
           }
-          conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 2);
+          conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 1);
           conversation.cleanupTag('page_html', '...cleaned HTML snapshot...', 1);
           conversation.cleanupTag('experience', '...cleaned experience...', 1);
           conversation.cleanupTag('applied_experience', '...cleaned past experience...', 1);
           conversation.cleanupTag('page_ui_map', '...cleaned UI map...', 1);
           conversation.cleanupTag('page_ui_map_overlay', '...cleaned UI overlay...', 1);
-          conversation.compactToolResults(3);
+          conversation.compactToolResults(2);
           if (iteration > 1) {
             const isNewPage = this.previousUrl !== null && this.previousUrl !== currentState.url;
@@ -270,16 +288,17 @@ export class Tester extends TaskAgent implements Agent {
             if (isNewPage && this.pilot) {
               const guidance = await this.pilot.reviewNewPage(task, currentState, conversation);
               if (guidance) nextStep += `\n\n${guidance}`;
-            } else if ((iteration % this.progressCheckInterval === 0 || this.consecutiveFailures >= 3 || this.consecutiveEmptyResults >= 2) && this.pilot) {
+            } else if (this.shouldAnalyzeProgress(iteration, currentState) && this.pilot) {
               const guidance = await this.pilot.analyzeProgress(task, currentState, conversation);
               if (guidance) nextStep += `\n\n${guidance}`;
               this.consecutiveFailures = 0;
+              this.lastAnalyzedStateHash = currentState.hash;
             }
             conversation.addUserText(nextStep);
           }
           const result = await this.provider.invokeConversation(conversation, tools, {
-            maxToolRoundtrips: 5,
+            maxToolRoundtrips: 3,
             toolChoice: 'required',
             stopWhen: () => task.hasFinished,
           });
@@ -368,10 +387,15 @@ export class Tester extends TaskAgent implements Agent {
             : undefined,
           catch: async ({ error, stop }) => {
             tag('error').log(`Test execution error: ${error}`);
+            const message = error instanceof Error ? error.message : String(error);
             if (!task.hasFinished) {
-              task.addNote(`Execution error: ${error instanceof Error ? error.message : String(error)}`);
+              task.addNote(`Execution error: ${message}`);
             }
-            stop();
+            if (error instanceof Error && error.name === 'AbortError') {
+              stop();
+              return;
+            }
+            conversation.addUserText(`Previous AI call failed: ${message}. Take a different approach on the next step.`);
           },
         }
       );
@@ -421,6 +445,14 @@ export class Tester extends TaskAgent implements Agent {
     };
   }
+  private shouldAnalyzeProgress(iteration: number, currentState: ActionResult): boolean {
+    if (this.consecutiveFailures >= 3) return true;
+    if (this.consecutiveEmptyResults >= 2) return true;
+    if (iteration % this.progressCheckInterval !== 0) return false;
+    if (this.lastAnalyzedStateHash === currentState.hash) return false;
+    return true;
+  }
   private async prepareInstructionsForNextStep(task: Test): Promise<string> {
     let outcomeStatus = dedent`
       <task>
@@ -511,17 +543,21 @@ export class Tester extends TaskAgent implements Agent {
     }
     if (isNewUrl) {
+      const alreadySeenUiMap = this.seenUiMapUrls.has(currentUrl);
       let research = '';
-      try {
-        research = await this.researcher.research(currentState);
-      } catch (err) {
-        if (!(err instanceof ErrorPageError)) throw err;
-        tag('warning').log(`Research skipped: ${err.message}`);
+      if (!alreadySeenUiMap) {
+        try {
+          research = await this.researcher.research(currentState);
+        } catch (err) {
+          if (!(err instanceof ErrorPageError)) throw err;
+          tag('warning').log(`Research skipped: ${err.message}`);
+        }
       }
       this.pageStateHash = currentStateHash;
       this.pageActionResult = currentState;
       let uiMapSection = '';
       if (research) {
+        this.seenUiMapUrls.add(currentUrl);
         uiMapSection = dedent`
           Page UI Map
@@ -530,6 +566,8 @@ export class Tester extends TaskAgent implements Agent {
           ${research}
           </page_ui_map>
         `;
+      } else if (alreadySeenUiMap) {
+        uiMapSection = `\n\n<page_ui_map>UI map for ${currentUrl} was shown earlier in this session — refer to it above.</page_ui_map>`;
       }
       context += dedent`
@@ -740,9 +778,8 @@ export class Tester extends TaskAgent implements Agent {
     `;
   }
-  private async buildTestPrompt(task: Test, actionResult: ActionResult): Promise<string> {
+  private buildScenarioBlock(task: Test, actionResult: ActionResult): string {
     const knowledge = this.getKnowledge(actionResult);
-    const pageContext = await this.reinjectContextIfNeeded(1, actionResult);
     return dedent`
       <task>
@@ -770,8 +807,6 @@ export class Tester extends TaskAgent implements Agent {
       ${this.buildAvailableFiles()}
       ${knowledge}
-      ${pageContext}
     `;
   }

package/src/ai/tools.ts CHANGED

@@ -510,7 +510,7 @@ export function createAgentTools({
           }
           return successToolResult('see', {
-            analysis: analysisResult,
+            analysis: cap(analysisResult, ANALYSIS_OUTPUT_CAP),
             message: `Successfully analyzed screenshot for: ${request}`,
             suggestion: 'Visual confirmation is valid evidence for test results. Use record() to note the visual findings.',
           });
@@ -559,8 +559,8 @@ export function createAgentTools({
             url: currentState.url,
             title: currentState.title,
             suggestion: 'If not enough context received, call see() to visually identify elements in page contents',
-            aria,
-            html,
+            aria: cap(aria, ARIA_OUTPUT_CAP),
+            html: cap(html, HTML_OUTPUT_CAP),
             reminder: 'Context provided. Do not call context() again until you perform actions or suspect page changed.',
           });
         } catch (error) {
@@ -657,7 +657,7 @@ export function createAgentTools({
           return successToolResult('research', {
             analysis: researchResult,
-            aria: ActionResult.fromState(currentState).getInteractiveARIA(),
+            aria: cap(ActionResult.fromState(currentState).getInteractiveARIA(), ARIA_OUTPUT_CAP),
             message: `Successfully researched page: ${currentState.url}.`,
             suggestion: dedent`
               You received comprehensive UI map report. Use it to understand the page structure and navigate to the elements.
@@ -1001,6 +1001,16 @@ export function createAgentTools({
 const PAGE_DIFF_SUGGESTION = 'Analyze page diff. htmlParts shows what changed and WHERE — each part has a container selector. Use the container as context when clicking elements from the diff.';
+const ARIA_OUTPUT_CAP = 4000;
+const HTML_OUTPUT_CAP = 6000;
+const ANALYSIS_OUTPUT_CAP = 2000;
+function cap(text: string | undefined | null, max: number): string {
+  if (!text) return '';
+  if (text.length <= max) return text;
+  return `${text.slice(0, max)}\n[...truncated; ${text.length - max} chars omitted...]`;
+}
 function transformContainsCommand(command: string): string {
   if (!command.includes(':contains(')) return command;
@@ -1044,8 +1054,12 @@ function successToolResult(action: string, data?: Record<string, any>, source?:
   if (data?.pageDiff) {
     let suggestion = PAGE_DIFF_SUGGESTION;
     const ariaChanges = data.pageDiff.ariaChanges || '';
+    const urlChanged = data.pageDiff.urlChanged === true;
+    const hasHtmlParts = Array.isArray(data.pageDiff.htmlParts) && data.pageDiff.htmlParts.length > 0;
     if (countAriaChanges(ariaChanges) >= 50) {
       suggestion = `MAJOR PAGE CHANGE. Page entered a different mode. Check htmlParts and iframes in pageDiff before next action. ${suggestion}`;
+    } else if (!urlChanged && !ariaChanges && !hasHtmlParts) {
+      suggestion = 'Action ran without error but produced no observable change (URL, ARIA and HTML all unchanged). The locator likely matched a non-interactive ancestor or an element outside the intended control. Re-locate via xpathCheck() or verify with see() before treating this as success.';
     } else if (ariaChanges.includes('heading') && ariaChanges.includes('added')) {
       suggestion += ' WARNING: A new panel or modal may have appeared. If this was not the intended action, close it and try a different element.';
     }