explorbot 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,15 @@ const responseLog = createDebug('explorbot:provider:in');
19
19
  class AiError extends Error {}
20
20
  export class ContextLengthError extends Error {}
21
21
 
22
/**
 * Extracts the number of cache-hit prompt tokens from a provider usage object.
 *
 * Candidate fields are checked in this order and the first valid one wins:
 *   1. `usage.cachedInputTokens`
 *   2. `usage.inputTokenDetails.cacheReadTokens`
 *   3. `usage.raw.prompt_tokens_details.cached_tokens`
 *   4. `usage.raw.promptTokensDetails.cachedTokens`
 *
 * A candidate is valid only when it is a finite, non-negative number. A plain
 * `typeof === 'number'` check would also accept `NaN`, which would then be
 * summed into the running statistics and turn every later total into `NaN`.
 *
 * @param usage - usage payload attached to a model response; may be null/undefined
 * @returns the cached token count, or 0 when no valid value is reported
 */
function extractCachedTokens(usage: any): number {
  if (!usage) return 0;

  const isTokenCount = (value: unknown): value is number => typeof value === 'number' && Number.isFinite(value) && value >= 0;

  const candidates: unknown[] = [
    usage.cachedInputTokens,
    usage.inputTokenDetails?.cacheReadTokens,
    usage.raw?.prompt_tokens_details?.cached_tokens,
    usage.raw?.promptTokensDetails?.cachedTokens,
  ];

  for (const candidate of candidates) {
    if (isTokenCount(candidate)) return candidate;
  }
  return 0;
}
30
+
22
31
  function rejectAfterIdle(ms: number, signal: { cancelled: boolean }): Promise<never> {
23
32
  return new Promise((_, reject) => {
24
33
  const tick = () => {
@@ -265,9 +274,10 @@ export class Provider {
265
274
 
266
275
  if (response.usage) {
267
276
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
268
- input: response.usage.promptTokens || 0,
269
- output: response.usage.completionTokens || 0,
270
- total: response.usage.totalTokens || 0,
277
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
278
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
279
+ total: response.usage.totalTokens ?? 0,
280
+ cached: extractCachedTokens(response.usage),
271
281
  });
272
282
  }
273
283
 
@@ -355,9 +365,10 @@ export class Provider {
355
365
 
356
366
  if (response.usage) {
357
367
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
358
- input: response.usage.promptTokens || 0,
359
- output: response.usage.completionTokens || 0,
360
- total: response.usage.totalTokens || 0,
368
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
369
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
370
+ total: response.usage.totalTokens ?? 0,
371
+ cached: extractCachedTokens(response.usage),
361
372
  });
362
373
  }
363
374
 
@@ -428,9 +439,10 @@ export class Provider {
428
439
 
429
440
  if (response.usage) {
430
441
  Stats.recordTokens(options.agentName || 'unknown', modelName, {
431
- input: response.usage.promptTokens || 0,
432
- output: response.usage.completionTokens || 0,
433
- total: response.usage.totalTokens || 0,
442
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
443
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
444
+ total: response.usage.totalTokens ?? 0,
445
+ cached: extractCachedTokens(response.usage),
434
446
  });
435
447
  }
436
448
 
@@ -625,9 +637,9 @@ export class Provider {
625
637
 
626
638
  if (response.usage) {
627
639
  Stats.recordTokens('vision', this.getModelName(this.config.visionModel), {
628
- input: response.usage.promptTokens || 0,
629
- output: response.usage.completionTokens || 0,
630
- total: response.usage.totalTokens || 0,
640
+ input: response.usage.inputTokens ?? response.usage.promptTokens ?? 0,
641
+ output: response.usage.outputTokens ?? response.usage.completionTokens ?? 0,
642
+ total: response.usage.totalTokens ?? 0,
631
643
  });
632
644
  }
633
645
 
@@ -240,11 +240,11 @@ Focus on what would confuse a real user or caused the agent to make mistakes.`;
240
240
  const criticalViolations = report.axeViolations.filter((v) => v.impact === 'critical' || v.impact === 'serious');
241
241
  for (const v of criticalViolations.slice(0, 3)) {
242
242
  const nodeHtml = v.nodes[0]?.html.slice(0, 100) || '';
243
- task.addNote(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
243
+ task.addVerificationDetail(`🔴 A11Y [${v.impact}] ${v.id}: ${v.description} — ${nodeHtml}`);
244
244
  }
245
245
 
246
246
  for (const issue of report.semanticIssues.slice(0, 3)) {
247
- task.addNote(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
247
+ task.addVerificationDetail(`💡 UX [${issue.type}] ${issue.element}: ${issue.suggestion}`);
248
248
  }
249
249
  }
250
250
 
package/src/ai/rules.ts CHANGED
@@ -241,6 +241,8 @@ export function multipleTabsRule(tabs: Array<{ url: string; title: string }>): s
241
241
 
242
242
  export const actionRule = dedent`
243
243
  <actions>
244
+ \`faker\` (from @faker-js/faker) is available inside I.* calls for generating data, e.g. I.fillField('Bio', faker.lorem.paragraphs(5)).
245
+
244
246
  ### I.click
245
247
 
246
248
  clicks on the element by its locator
@@ -19,69 +19,71 @@ export class SessionAnalyst implements Agent {
19
19
  const eligible = tests.filter((t) => t.startTime != null);
20
20
  if (eligible.length === 0) return '';
21
21
 
22
- const model = this.provider.getModelForAgent('analyst');
22
+ const model = this.provider.getAgenticModel('analyst');
23
23
  const customPrompt = this.provider.getSystemPromptForAgent('analyst', undefined);
24
24
 
25
25
  const systemPrompt = dedent`
26
- You write a brief end-of-session report after autonomous exploratory testing. Your reader is a developer who needs to know in seconds: what is broken, how to reproduce it, and which results were inconclusive.
26
+ You write a TERSE end-of-session report. Reader is a developer who wants to UNDERSTAND THE FEATURE — what works, what is broken, what is unclear. Every word must earn its place.
27
27
 
28
- Output MARKDOWN. No JSON, no preamble, no closing remarks. Start with the heading.
28
+ Output MARKDOWN. No JSON, no preamble, no closing summary.
29
29
 
30
- ## Clustering
31
- Group by ROOT CAUSE, not by scenario. If three tests fail for the same dropdown, that is ONE defect listing all three test refs (#3, #5, #7). Do not produce one cluster per test.
30
+ NO EMOJI. No 🔴 🟡 🟢 ✅, no escape sequences like \\u2705. Use plain text severity tags: [High], [Medium], [Low] for defects.
32
31
 
33
- ## Bucketing
34
- Use the FINAL verdict (the test's \`result\` field) as the starting point. Mid-test errors that the automation recovered from do NOT make a passed test unreliable.
32
+ ## Reporting unit
35
33
 
36
- - **Defect** real product bug. \`result: failed\` AND the failure reflects the app misbehaving (not the automation). The automation completed its interactions, the app contradicted the expected outcome. Severity required.
37
- - **UX issue** — app works but the UI is ambiguous, controls are hidden, or labels are unclear. Worth flagging to design.
38
- - **Execution issue** — the FINAL verdict is unreliable. Only two cases:
39
- 1. \`result: failed\` AND the failure was automation, environment, or UI/UX (locator missing, timeout, AI loop, navigation stuck, modal trapped focus, no accessible label) — i.e. the test could not conclude whether the app works.
40
- 2. \`result: passed\` AND clear evidence in the log shows the user-visible goal was NOT achieved (no confirmation visible, no state change verified, the assertion was vacuous).
34
+ Report at the level of FEATURES / FLOWS / PAGES. Tests are evidence, not the unit. Several tests covering the same flow → ONE entry citing all of them.
41
35
 
42
- A test that passed and shows no contrary evidence belongs in NO section. Do not list passed tests just because the log contains intermediate retries or recovered failures.
36
+ ## Walk every test
43
37
 
44
- ## Severity emoji (defects only)
45
- - 🔴 critical or high — core flow blocked, data loss, security
46
- - 🟡 medium — partial breakage with workaround
47
- - 🟢 low — cosmetic
38
+ PASSED test: did all steps run, was the goal actually verified, did the user-visible goal happen? All yes → contributes to What works. Any no → Execution issue (false positive).
48
39
 
49
- ## Required format
40
+ FAILED test, first match wins: (1) goal achieved but mis-verified → Execution. (2) automation failure (locator/timeout/loop/modal/a11y) → Execution. (3) bad preconditions or data → Execution. (4) wrong URL/environment → Execution. (5) app contradicted expected outcome → Defect.
41
+
42
+ Crucial distinction: "the app misbehaved" vs "the automation could not interact with the app". ONLY the first is a Defect. If the automation gives up before the app responds — timeout, retries exhausted, dead loop / loop detected, could not click or find an element — that is an Execution issue regardless of what the log calls it. Failure inside the automation ≠ failure inside the product.
43
+
44
+ A solitary failure where adjacent tests on the same feature passed → Execution, not Defect.
45
+
46
+ ## Severity (defects only)
47
+ [High] blocks a core flow · [Medium] degrades a flow but workaround exists · [Low] cosmetic / edge case
48
+
49
+ ## Format
50
50
 
51
51
  # Session Analysis
52
52
 
53
- <one sentence: total tests, defect count, headline finding>
53
+ <ONE or TWO sentences describing the FEATURE STATE — what was explored, whether the core flow holds, what the standout problem is. NO test counts, NO "N tests run". Talk about the product, not the run.>
54
+
55
+ ## Coverage
56
+ - Pages: <paths>
57
+ - Features: <capabilities>
58
+
59
+ ## What works
60
+ - **<feature>** — #2, #7, #8
54
61
 
55
62
  ## Defects
56
63
 
57
- ### 🔴 <plain-English title of the BUG, not the scenario name>
58
- Affects: #3, #5, #7
64
+ ### [Medium] <plain-English bug title>
65
+ Affects: #3, #5
59
66
  Reproduce:
60
- 1. <concrete UI step a person can replay>
61
- 2. <next step>
62
- Evidence: <one short observation from the test log>
63
-
64
- ### 🟡 <next defect>
65
- ...
67
+ 1. <concrete UI step>
68
+ 2. <next>
69
+ Evidence: <one short observation>
66
70
 
67
71
  ## UX issues
68
-
69
- - **<title>** — #4
70
- <one short evidence line>
72
+ - **<feature>** — <what's confusing> (#7)
71
73
 
72
74
  ## Execution Issues
75
+ - **#2 <scenario>** — <≤10 words, what was unreliable>
73
76
 
74
- - **<short test name or scenario phrase>** — <plain-English one-liner: what made the result unreliable>
75
- - **<…>** — <…>
77
+ ## Brevity rules
76
78
 
77
- ## Rules
78
- - Defects first, sorted by severity descending. Omit any section that has zero entries.
79
- - Defect title describes the BUG ("Run-type dropdown does not filter"), never the scenario name.
80
- - Reproduce steps are concrete UI actions derived from the log: URL + clicks + inputs. Imperative, one short line each.
81
- - Evidence is the smallest factual observation from notes/steps that supports the claim — what was OBSERVED in the page (HTML, message, missing element). Never quote the test's \`result\` field as evidence; that is a tautology.
82
- - **Execution Issues** entries must explain what actually went wrong in concrete terms a human understands: "could not find a Submit button after navigation", "page reloaded before the assertion ran", "passed without ever seeing a confirmation message", "marked failed but the new item appears in the list", "modal trapped focus and tests could not click outside", "ARIA tree had no labelled controls". Avoid jargon like "locator failed" without context. Never write category prefixes ("execution:", "false-positive:") the section header already says it. No emoji on these entries.
83
- - Do NOT include a passed test in any section unless evidence proves its goal was not achieved. Intermediate retries or recovered errors in the log are not grounds for listing a passed test.
84
- - No editorialising, no restating the scenario verbatim, no closing summary.
79
+ - Headline: 2 sentences MAX. About the FEATURE, not the run. No counts, no "N tests", no "this session". Banned words: "exercised", "comprehensive", "notably", "this session", "module", "targeted", "covered creation".
80
+ - What works: feature name + test refs. NO parentheticals, NO caveats. If there's a caveat, the entry doesn't belong here.
81
+ - Defect title is the BUG ("Search returns non-matching results"), never the scenario name.
82
+ - Reproduce steps are imperative one-liners drawn from the log.
83
+ - Evidence is one short factual observation. Never quote the \`result\` field.
84
+ - Execution Issues: ONE line per test, ≤10 words, plain. Examples: "passed vacuously, no list assertion", "no file upload step in log", "dead loop on Save click". No prefixes, no nested explanation.
85
+ - Omit any empty section.
86
+ - Section order: Coverage → What works → Defects (severity desc) → UX issues → Execution Issues.
85
87
 
86
88
  ${customPrompt || ''}
87
89
  `;
@@ -101,7 +103,7 @@ export class SessionAnalyst implements Agent {
101
103
  { agentName: 'analyst' }
102
104
  );
103
105
 
104
- return (response?.text || '').trim();
106
+ return decodeEscapes((response?.text || '').trim());
105
107
  }
106
108
 
107
109
  writeReport(markdown: string): string {
@@ -131,3 +133,7 @@ export class SessionAnalyst implements Agent {
131
133
  `;
132
134
  }
133
135
  }
136
+
137
/**
 * Replaces literal JavaScript-style Unicode escape sequences that appear as
 * plain text (`\u2705`, `\u{1F534}`) with the characters they denote.
 *
 * Both forms are handled in a single pass, so text produced by decoding one
 * escape is never re-scanned and decoded a second time.
 *
 * An escape whose value is not a valid Unicode code point (greater than
 * 0x10FFFF) is left untouched: `String.fromCodePoint` would throw a
 * `RangeError` for it, and one malformed escape should not abort the caller.
 *
 * Four-digit escapes are decoded one code unit at a time, so a surrogate pair
 * written as two consecutive `\uXXXX` escapes recombines into one character.
 *
 * @param text - text that may contain literal `\uXXXX` / `\u{X...}` sequences
 * @returns the text with every valid escape decoded
 */
function decodeEscapes(text: string): string {
  const MAX_CODE_POINT = 0x10ffff;
  return text.replace(/\\u\{([0-9a-fA-F]+)\}|\\u([0-9a-fA-F]{4})/g, (match: string, braced?: string, fixed?: string) => {
    const codePoint = Number.parseInt(braced ?? fixed ?? '', 16);
    if (!Number.isInteger(codePoint) || codePoint > MAX_CODE_POINT) return match;
    return String.fromCodePoint(codePoint);
  });
}
package/src/ai/tester.ts CHANGED
@@ -64,6 +64,8 @@ export class Tester extends TaskAgent implements Agent {
64
64
  private pageStateHash: string | null = null;
65
65
  private pageActionResult: ActionResult | null = null;
66
66
  private hooksRunner: HooksRunner;
67
+ private seenUiMapUrls = new Set<string>();
68
+ private lastAnalyzedStateHash: string | null = null;
67
69
 
68
70
  constructor(explorer: Explorer, provider: Provider, researcher: Researcher, navigator: Navigator, agentTools?: any) {
69
71
  super();
@@ -104,7 +106,7 @@ export class Tester extends TaskAgent implements Agent {
104
106
  }
105
107
 
106
108
  private get progressCheckInterval(): number {
107
- return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ?? 5;
109
+ return (this.explorer.getConfig().ai?.agents?.tester as any)?.progressCheckInterval ?? 3;
108
110
  }
109
111
 
110
112
  getConversation(): Conversation | null {
@@ -123,6 +125,8 @@ export class Tester extends TaskAgent implements Agent {
123
125
  this.previousStateHash = null;
124
126
  this.pageStateHash = null;
125
127
  this.pageActionResult = null;
128
+ this.seenUiMapUrls.clear();
129
+ this.lastAnalyzedStateHash = null;
126
130
  this.explorer.getStateManager().clearHistory();
127
131
  this.resetFailureCount();
128
132
  this.pilot?.reset();
@@ -147,14 +151,20 @@ export class Tester extends TaskAgent implements Agent {
147
151
  const initialState = ActionResult.fromState(state);
148
152
 
149
153
  const conversation = this.provider.startConversation(this.getSystemMessage(), 'tester');
154
+ conversation.markLastMessageCacheable();
150
155
  this.currentConversation = conversation;
151
156
 
152
157
  const outputDir = ConfigParser.getInstance().getOutputDir();
153
158
  this.executionLogFile = join(outputDir, `tester_${task.sessionName}.md`);
154
159
  // Note: Markdown saving functionality removed from Conversation class
155
160
 
156
- const initialPrompt = await this.buildTestPrompt(task, initialState);
157
- conversation.addUserText(initialPrompt);
161
+ const scenarioBlock = this.buildScenarioBlock(task, initialState);
162
+ conversation.addUserText(scenarioBlock);
163
+ conversation.markLastMessageCacheable();
164
+ conversation.protectPrefix(conversation.messages.length);
165
+
166
+ const pageContext = await this.reinjectContextIfNeeded(1, initialState);
167
+ if (pageContext) conversation.addUserText(pageContext);
158
168
 
159
169
  return await Observability.run(
160
170
  `test: ${task.scenario}`,
@@ -177,6 +187,12 @@ export class Tester extends TaskAgent implements Agent {
177
187
  if (this.pilot) {
178
188
  try {
179
189
  const plan = await this.pilot.planTest(task, initialState);
190
+ if (task.hasFinished) {
191
+ offFailedRequest?.();
192
+ page?.off('pageerror', onPageError);
193
+ page?.off('console', onConsoleMessage);
194
+ return { success: task.isSuccessful };
195
+ }
180
196
  if (plan) {
181
197
  conversation.addUserText(`Pilot's test plan:\n${plan}\n\nFollow this plan while executing the test.`);
182
198
  }
@@ -200,13 +216,15 @@ export class Tester extends TaskAgent implements Agent {
200
216
  debugLog(`Navigating to ${task.startUrl}`);
201
217
  await this.explorer.visit(task.startUrl!);
202
218
 
203
- const currentUrl = this.explorer.getStateManager().getCurrentState()?.url || task.startUrl || '';
219
+ const startState = this.explorer.getStateManager().getCurrentState();
220
+ if (startState) task.addUrlNote(startState);
221
+ const currentUrl = startState?.url || task.startUrl || '';
204
222
  await this.hooksRunner.runBeforeHook('tester', currentUrl);
205
223
 
206
224
  const offStateChange = this.explorer.getStateManager().onStateChange((event: StateTransition) => {
207
225
  if (task.hasFinished) return;
208
226
  if (event.toState?.url === event.fromState?.url) return;
209
- task.addNote(`Navigated to ${event.toState?.url}`, TestResult.PASSED);
227
+ if (event.toState) task.addUrlNote(event.toState, event.fromState || undefined);
210
228
  task.states.push(event.toState);
211
229
  });
212
230
 
@@ -253,13 +271,13 @@ export class Tester extends TaskAgent implements Agent {
253
271
  `);
254
272
  }
255
273
 
256
- conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 2);
274
+ conversation.cleanupTag('page_aria', '...cleaned aria snapshot...', 1);
257
275
  conversation.cleanupTag('page_html', '...cleaned HTML snapshot...', 1);
258
276
  conversation.cleanupTag('experience', '...cleaned experience...', 1);
259
277
  conversation.cleanupTag('applied_experience', '...cleaned past experience...', 1);
260
278
  conversation.cleanupTag('page_ui_map', '...cleaned UI map...', 1);
261
279
  conversation.cleanupTag('page_ui_map_overlay', '...cleaned UI overlay...', 1);
262
- conversation.compactToolResults(3);
280
+ conversation.compactToolResults(2);
263
281
 
264
282
  if (iteration > 1) {
265
283
  const isNewPage = this.previousUrl !== null && this.previousUrl !== currentState.url;
@@ -270,16 +288,17 @@ export class Tester extends TaskAgent implements Agent {
270
288
  if (isNewPage && this.pilot) {
271
289
  const guidance = await this.pilot.reviewNewPage(task, currentState, conversation);
272
290
  if (guidance) nextStep += `\n\n${guidance}`;
273
- } else if ((iteration % this.progressCheckInterval === 0 || this.consecutiveFailures >= 3 || this.consecutiveEmptyResults >= 2) && this.pilot) {
291
+ } else if (this.shouldAnalyzeProgress(iteration, currentState) && this.pilot) {
274
292
  const guidance = await this.pilot.analyzeProgress(task, currentState, conversation);
275
293
  if (guidance) nextStep += `\n\n${guidance}`;
276
294
  this.consecutiveFailures = 0;
295
+ this.lastAnalyzedStateHash = currentState.hash;
277
296
  }
278
297
  conversation.addUserText(nextStep);
279
298
  }
280
299
 
281
300
  const result = await this.provider.invokeConversation(conversation, tools, {
282
- maxToolRoundtrips: 5,
301
+ maxToolRoundtrips: 3,
283
302
  toolChoice: 'required',
284
303
  stopWhen: () => task.hasFinished,
285
304
  });
@@ -421,6 +440,14 @@ export class Tester extends TaskAgent implements Agent {
421
440
  };
422
441
  }
423
442
 
443
/**
 * Decides whether a progress analysis should run on this iteration.
 *
 * Returns true immediately when the agent has accumulated at least 3
 * consecutive failures or at least 2 consecutive empty results. Otherwise
 * it returns true only on iterations that are a multiple of
 * `progressCheckInterval`, and only if the current state hash differs from
 * the hash recorded in `lastAnalyzedStateHash`.
 */
private shouldAnalyzeProgress(iteration: number, currentState: ActionResult): boolean {
  const isStruggling = this.consecutiveFailures >= 3 || this.consecutiveEmptyResults >= 2;
  if (isStruggling) return true;

  const isCheckpoint = iteration % this.progressCheckInterval === 0;
  return isCheckpoint && this.lastAnalyzedStateHash !== currentState.hash;
}
450
+
424
451
  private async prepareInstructionsForNextStep(task: Test): Promise<string> {
425
452
  let outcomeStatus = dedent`
426
453
  <task>
@@ -511,17 +538,21 @@ export class Tester extends TaskAgent implements Agent {
511
538
  }
512
539
 
513
540
  if (isNewUrl) {
541
+ const alreadySeenUiMap = this.seenUiMapUrls.has(currentUrl);
514
542
  let research = '';
515
- try {
516
- research = await this.researcher.research(currentState);
517
- } catch (err) {
518
- if (!(err instanceof ErrorPageError)) throw err;
519
- tag('warning').log(`Research skipped: ${err.message}`);
543
+ if (!alreadySeenUiMap) {
544
+ try {
545
+ research = await this.researcher.research(currentState);
546
+ } catch (err) {
547
+ if (!(err instanceof ErrorPageError)) throw err;
548
+ tag('warning').log(`Research skipped: ${err.message}`);
549
+ }
520
550
  }
521
551
  this.pageStateHash = currentStateHash;
522
552
  this.pageActionResult = currentState;
523
553
  let uiMapSection = '';
524
554
  if (research) {
555
+ this.seenUiMapUrls.add(currentUrl);
525
556
  uiMapSection = dedent`
526
557
 
527
558
  Page UI Map
@@ -530,6 +561,8 @@ export class Tester extends TaskAgent implements Agent {
530
561
  ${research}
531
562
  </page_ui_map>
532
563
  `;
564
+ } else if (alreadySeenUiMap) {
565
+ uiMapSection = `\n\n<page_ui_map>UI map for ${currentUrl} was shown earlier in this session — refer to it above.</page_ui_map>`;
533
566
  }
534
567
 
535
568
  context += dedent`
@@ -740,9 +773,8 @@ export class Tester extends TaskAgent implements Agent {
740
773
  `;
741
774
  }
742
775
 
743
- private async buildTestPrompt(task: Test, actionResult: ActionResult): Promise<string> {
776
+ private buildScenarioBlock(task: Test, actionResult: ActionResult): string {
744
777
  const knowledge = this.getKnowledge(actionResult);
745
- const pageContext = await this.reinjectContextIfNeeded(1, actionResult);
746
778
 
747
779
  return dedent`
748
780
  <task>
@@ -770,8 +802,6 @@ export class Tester extends TaskAgent implements Agent {
770
802
  ${this.buildAvailableFiles()}
771
803
 
772
804
  ${knowledge}
773
-
774
- ${pageContext}
775
805
  `;
776
806
  }
777
807
 
package/src/ai/tools.ts CHANGED
@@ -510,7 +510,7 @@ export function createAgentTools({
510
510
  }
511
511
 
512
512
  return successToolResult('see', {
513
- analysis: analysisResult,
513
+ analysis: cap(analysisResult, ANALYSIS_OUTPUT_CAP),
514
514
  message: `Successfully analyzed screenshot for: ${request}`,
515
515
  suggestion: 'Visual confirmation is valid evidence for test results. Use record() to note the visual findings.',
516
516
  });
@@ -559,8 +559,8 @@ export function createAgentTools({
559
559
  url: currentState.url,
560
560
  title: currentState.title,
561
561
  suggestion: 'If not enough context received, call see() to visually identify elements in page contents',
562
- aria,
563
- html,
562
+ aria: cap(aria, ARIA_OUTPUT_CAP),
563
+ html: cap(html, HTML_OUTPUT_CAP),
564
564
  reminder: 'Context provided. Do not call context() again until you perform actions or suspect page changed.',
565
565
  });
566
566
  } catch (error) {
@@ -657,7 +657,7 @@ export function createAgentTools({
657
657
 
658
658
  return successToolResult('research', {
659
659
  analysis: researchResult,
660
- aria: ActionResult.fromState(currentState).getInteractiveARIA(),
660
+ aria: cap(ActionResult.fromState(currentState).getInteractiveARIA(), ARIA_OUTPUT_CAP),
661
661
  message: `Successfully researched page: ${currentState.url}.`,
662
662
  suggestion: dedent`
663
663
  You received comprehensive UI map report. Use it to understand the page structure and navigate to the elements.
@@ -1001,6 +1001,16 @@ export function createAgentTools({
1001
1001
 
1002
1002
  const PAGE_DIFF_SUGGESTION = 'Analyze page diff. htmlParts shows what changed and WHERE — each part has a container selector. Use the container as context when clicking elements from the diff.';
1003
1003
 
1004
+ const ARIA_OUTPUT_CAP = 4000;
1005
+ const HTML_OUTPUT_CAP = 6000;
1006
+ const ANALYSIS_OUTPUT_CAP = 2000;
1007
+
1008
/**
 * Limits `text` to at most `max` UTF-16 code units, appending a marker that
 * states how many code units were dropped.
 *
 * - Empty, null or undefined input yields ''.
 * - Text that already fits is returned unchanged.
 * - A non-positive or NaN `max` is treated as 0; fractional values are floored.
 * - The cut never lands between the two halves of a surrogate pair: if the
 *   last kept unit is a high surrogate, it is dropped too, so the result
 *   never ends in a lone surrogate.
 *
 * @param text - text to limit
 * @param max - maximum number of code units of `text` to keep
 * @returns the original text, or its truncated prefix followed by the marker
 */
function cap(text: string | undefined | null, max: number): string {
  if (!text) return '';
  if (text.length <= max) return text;

  let end = max > 0 ? Math.floor(max) : 0;
  if (end > 0) {
    const lastKept = text.charCodeAt(end - 1);
    // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  }

  return `${text.slice(0, end)}\n[...truncated; ${text.length - end} chars omitted...]`;
}
1013
+
1004
1014
  function transformContainsCommand(command: string): string {
1005
1015
  if (!command.includes(':contains(')) return command;
1006
1016
 
@@ -1044,8 +1054,12 @@ function successToolResult(action: string, data?: Record<string, any>, source?:
1044
1054
  if (data?.pageDiff) {
1045
1055
  let suggestion = PAGE_DIFF_SUGGESTION;
1046
1056
  const ariaChanges = data.pageDiff.ariaChanges || '';
1057
+ const urlChanged = data.pageDiff.urlChanged === true;
1058
+ const hasHtmlParts = Array.isArray(data.pageDiff.htmlParts) && data.pageDiff.htmlParts.length > 0;
1047
1059
  if (countAriaChanges(ariaChanges) >= 50) {
1048
1060
  suggestion = `MAJOR PAGE CHANGE. Page entered a different mode. Check htmlParts and iframes in pageDiff before next action. ${suggestion}`;
1061
+ } else if (!urlChanged && !ariaChanges && !hasHtmlParts) {
1062
+ suggestion = 'Action ran without error but produced no observable change (URL, ARIA and HTML all unchanged). The locator likely matched a non-interactive ancestor or an element outside the intended control. Re-locate via xpathCheck() or verify with see() before treating this as success.';
1049
1063
  } else if (ariaChanges.includes('heading') && ariaChanges.includes('added')) {
1050
1064
  suggestion += ' WARNING: A new panel or modal may have appeared. If this was not the intended action, close it and try a different element.';
1051
1065
  }
@@ -1,6 +1,7 @@
1
1
  import figureSet from 'figures';
2
2
  import { getStyles } from '../ai/planner/styles.js';
3
3
  import { outputPath } from '../config.js';
4
+ import { normalizeUrl } from '../state-manager.js';
4
5
  import { Stats } from '../stats.js';
5
6
  import type { Plan } from '../test-plan.js';
6
7
  import { getCliName } from '../utils/cli-name.ts';
@@ -11,6 +12,8 @@ import { type NextStepSection, printNextSteps, relativeToCwd } from '../utils/ne
11
12
  import { safeFilename } from '../utils/strings.ts';
12
13
  import { BaseCommand, type Suggestion } from './base-command.js';
13
14
 
15
+ const MAX_SUB_PAGE_ATTEMPTS = 30;
16
+
14
17
  export class ExploreCommand extends BaseCommand {
15
18
  name = 'explore';
16
19
  description = 'Start web exploration';
@@ -27,6 +30,7 @@ export class ExploreCommand extends BaseCommand {
27
30
  maxTests?: number;
28
31
  private testsRun = 0;
29
32
  private completedPlans: Plan[] = [];
33
+ private failedSubPages = new Set<string>();
30
34
 
31
35
  async execute(args: string): Promise<void> {
32
36
  const { opts, args: remaining } = this.parseArgs(args);
@@ -46,10 +50,12 @@ export class ExploreCommand extends BaseCommand {
46
50
 
47
51
  if (!feature && !this.isLimitReached()) {
48
52
  const planner = this.explorBot.agentPlanner();
49
- while (true) {
53
+ let attempts = 0;
54
+ while (attempts < MAX_SUB_PAGE_ATTEMPTS) {
55
+ attempts++;
50
56
  if (this.isLimitReached()) break;
51
57
 
52
- const candidates = planner.collectSubPageCandidates(mainPlan, mainUrl || '/');
58
+ const candidates = planner.collectSubPageCandidates(mainPlan, mainUrl || '/').filter((c) => !this.failedSubPages.has(normalizeUrl(c.url)));
53
59
  if (candidates.length === 0) break;
54
60
 
55
61
  const pick = await planner.pickNextSubPage(candidates);
@@ -64,6 +70,7 @@ export class ExploreCommand extends BaseCommand {
64
70
  this.completedPlans.push(subPlan);
65
71
  }
66
72
  } catch (err) {
73
+ this.failedSubPages.add(normalizeUrl(pick.url));
67
74
  tag('warning').log(`Sub-page exploration failed: ${err instanceof Error ? err.message : err}`);
68
75
  }
69
76
  }
@@ -52,9 +52,12 @@ export const StatusPane: React.FC<{ onComplete?: () => void }> = ({ onComplete }
52
52
  <Text bold>Usage</Text>
53
53
  </Box>
54
54
  <Row label="Time" value={Stats.getElapsedTime()} />
55
- {tokenRows.map(([model, tokens]) => (
56
- <Row key={model} label={model} value={`${Stats.humanizeTokens(tokens.total)} tokens`} />
57
- ))}
55
+ {tokenRows.map(([model, tokens]) => {
56
+ const cached = tokens.cached ?? 0;
57
+ const cachePct = tokens.input > 0 ? Math.round((cached / tokens.input) * 100) : 0;
58
+ const suffix = cached > 0 ? ` (${Stats.humanizeTokens(cached)} cached, ${cachePct}%)` : '';
59
+ return <Row key={model} label={model} value={`${Stats.humanizeTokens(tokens.total)} tokens${suffix}`} />;
60
+ })}
58
61
  </>
59
62
  )}
60
63
  </Box>
@@ -3,6 +3,7 @@ import { basename, dirname, join } from 'node:path';
3
3
  import matter from 'gray-matter';
4
4
  import { type Tokens, marked } from 'marked';
5
5
  import type { ActionResult } from './action-result.js';
6
+ import { isNonReusableCode } from './ai/historian/utils.ts';
6
7
  import { ConfigParser } from './config.js';
7
8
  import { KnowledgeTracker } from './knowledge-tracker.js';
8
9
  import type { WebPageState } from './state-manager.js';
@@ -166,6 +167,10 @@ export class ExperienceTracker {
166
167
  writeAction(state: ActionResult, action: ActionInput): void {
167
168
  if (this.disabled || this.isWritingDisabled(state)) return;
168
169
  if (!action.code?.trim()) return;
170
+ if (isNonReusableCode(action.code)) {
171
+ debugLog('Skipping action with non-reusable code: %s', action.code);
172
+ return;
173
+ }
169
174
 
170
175
  this.ensureExperienceFile(state);
171
176
  const stateHash = state.getStateHash();
@@ -189,6 +194,10 @@ export class ExperienceTracker {
189
194
  writeFlow(state: ActionResult, body: string, relatedUrls?: string[]): void {
190
195
  if (this.disabled || this.isWritingDisabled(state)) return;
191
196
  if (!body?.trim()) return;
197
+ if (isNonReusableCode(body)) {
198
+ debugLog('Skipping flow body with non-reusable code');
199
+ return;
200
+ }
192
201
 
193
202
  this.ensureExperienceFile(state);
194
203
  const stateHash = state.getStateHash();
package/src/explorer.ts CHANGED
@@ -549,10 +549,7 @@ class Explorer {
549
549
  if (!this.stateManager.getCurrentState()) return;
550
550
 
551
551
  const lastScreenshot = ActionResult.fromState(this.stateManager.getCurrentState()!).screenshotFile;
552
- if (!lastScreenshot) return;
553
-
554
- const screenshotPath = outputPath('states', lastScreenshot);
555
- test.addArtifact(screenshotPath);
552
+ test.setActiveNoteScreenshot(lastScreenshot);
556
553
  };
557
554
 
558
555
  const dialogHandler = (dialog: any) => {
package/src/reporter.ts CHANGED
@@ -110,7 +110,7 @@ export class Reporter {
110
110
  const timeoutMs = Number(process.env.TESTOMATIO_TIMEOUT_MS || '15000');
111
111
  const timeoutPromise = new Promise<'timeout'>((resolve) => setTimeout(() => resolve('timeout'), timeoutMs));
112
112
 
113
- const result = await Promise.race([this.client.createRun().then(() => 'success' as const), timeoutPromise]);
113
+ const result = await Promise.race([this.client.createRun({ configuration: { exploratory: true } }).then(() => 'success' as const), timeoutPromise]);
114
114
 
115
115
  if (result === 'timeout') {
116
116
  debugLog('Reporter run creation timed out');
@@ -145,6 +145,7 @@ export class Reporter {
145
145
  message: note.message,
146
146
  status: note.status,
147
147
  screenshot: note.screenshot,
148
+ log: note.log,
148
149
  }))
149
150
  .sort((a, b) => a.startTime - b.startTime);
150
151
 
@@ -180,9 +181,18 @@ export class Reporter {
180
181
  if (noteEntry.screenshot) {
181
182
  step.artifacts = [outputPath('states', noteEntry.screenshot)];
182
183
  }
184
+ if (noteEntry.log) {
185
+ step.log = noteEntry.log;
186
+ }
183
187
  steps.push(step);
184
188
  }
185
189
 
190
+ const verificationStep = this.buildVerificationStep(test, lastScreenshotFile);
191
+ if (verificationStep) {
192
+ steps.push(verificationStep);
193
+ return steps;
194
+ }
195
+
186
196
  if (lastScreenshotFile && steps.length > 0) {
187
197
  const lastStep = steps[steps.length - 1];
188
198
  const screenshotPath = outputPath('states', lastScreenshotFile);
@@ -196,6 +206,39 @@ export class Reporter {
196
206
  return steps;
197
207
  }
198
208
 
209
/**
 * Builds the "Verification" step for a test from `test.verification`.
 *
 * Returns undefined when the test has no verification data. Otherwise the
 * returned step carries, as nested framework steps and in this order: the
 * verification message (if any), the final URL (if any, with the URL placed
 * in `log`), and one entry per verification detail. The step's screenshot
 * artifact comes from the verification itself, falling back to
 * `lastScreenshotFile`; no artifact is attached when neither is set.
 *
 * @param test - test whose verification data is reported
 * @param lastScreenshotFile - fallback screenshot file name
 * @returns the verification step, or undefined when there is nothing to report
 */
private buildVerificationStep(test: Test, lastScreenshotFile?: string): Step | undefined {
  const verification = test.verification;
  if (!verification) return undefined;

  const nested: Step[] = [];

  if (verification.message) {
    nested.push({ category: 'framework', title: verification.message, duration: 0 });
  }

  if (verification.url) {
    const title = verification.pageLabel ? `Navigated to ${verification.pageLabel}` : 'Final page';
    nested.push({ category: 'framework', title, log: verification.url, duration: 0 });
  }

  for (const detail of verification.details) {
    nested.push({ category: 'framework', title: detail, duration: 0 });
  }

  const screenshotFile = verification.screenshot || lastScreenshotFile;

  return {
    category: 'user',
    title: 'Verification',
    duration: 0,
    status: verification.status || 'none',
    steps: nested.length > 0 ? nested : undefined,
    // Attach the artifact only when a screenshot is available.
    ...(screenshotFile ? { artifacts: [outputPath('states', screenshotFile)] } : {}),
  };
}
241
+
199
242
  async reportTest(test: Test, meta?: ReporterMeta): Promise<void> {
200
243
  await this.startRun();
201
244