npm - @sun-asterisk/sungen - Versions diffs - 2.7.0-beta.1 → 3.0.0-beta.71 - Mend

@sun-asterisk/sungen 2.7.0-beta.1 → 3.0.0-beta.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (245) hide show

package/src/harness/catalog/universal-viewpoints.yaml ADDED Viewed

@@ -0,0 +1,114 @@
+# Seed Universal Viewpoint Catalog (bundled, local — NOT a server).
+# Role: REFERENCE for the Harness Coverage Gate (the "answer key" sensors check
+# against). The AI still GENERATES the viewpoint-overview; this catalog only
+# verifies that critical themes for a page-type were not missed.
+#
+# Each page-type lists must-cover themes. A theme is "covered" when the project's
+# viewpoint-overview (or generated scenarios) contains one of its keywords.
+# See docs/orchestration-spec.md §5.2 and reports/sungen_refactor_spec.md §9.
+#
+# `depth:` (optional, harness-roadmap P1) marks a theme as DATA-correctness:
+#   requires: data-assertion  → scenarios on this theme must assert DATA (not just
+#                               visibility) to count as "deep". Drives the depth gate.
+#   cross_screen: true        → genuine depth needs another screen → use a flow
+#                               (P5 `remember` / `see all … contain`); on one screen
+#                               it should be @manual + deferred-to-flow, not shallow.
+#   keywords:                 → PRECISE data-noun keywords for depth matching (kept
+#                               separate from coverage keywords to avoid matching
+#                               navigation scenarios like "API list page").
+#   template:                 → the deep step the generator should emit by default.
+# Themes with no `depth:` are visibility/navigation — landing/seeing IS the assertion.
+page_types:
+  ecommerce-list:
+    detect_keywords: [cart, product, checkout, catalog, brand, category, "add to cart"]
+    must_cover:
+      - theme: list-data
+        keywords: [list, displayed, card, "product card", grid]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["product price", "product card", "every card", "all product", "product name", "displays a price", "card displays"]
+          template: "User see all [Product Price] contain {{currency}}   (and name/image per card)"
+      - theme: product-detail-consistency
+        keywords: [consistent, consistency, match, "same product", "correct product"]
+        depth:
+          requires: data-assertion
+          cross_screen: true
+          keywords: ["product detail", "detail page", "same product name", "matching name", "consistency", "same price"]
+          template: "User remember [Product Name] text as {{v}} … User see [Detail Product Name] header with {{v}}"
+      - theme: cart-correctness
+        keywords: ["cart contains", quantity, "price", subtotal, "in cart", "cart content", "cart product"]
+        depth:
+          requires: data-assertion
+          cross_screen: true
+          keywords: ["cart product", "cart contains", "in cart", "cart item", "appears in the cart", "cart line", "subtotal", "quantity"]
+          template: "User remember [Product Name] text as {{v}} … User see all [Cart Product Name] contain {{v}}"
+      - theme: category-filter-correctness
+        keywords: ["belong to the selected category", "category result", "matching the selected category", "products belong to", "only products that belong to the selected category"]
+        depth:
+          requires: data-assertion
+          cross_screen: true
+          keywords: ["belong to the selected category", "category result", "products that belong", "matching the selected category"]
+          template: "User see all [Result Product Name] contain {{selected_category}}"
+      - theme: brand-filter-correctness
+        keywords: ["belong to the selected brand", "brand result", "only products ... brand", "products that belong to the selected brand", "all displayed products belong to"]
+        depth:
+          requires: data-assertion
+          cross_screen: true
+          keywords: ["belong to the selected brand", "brand result", "products that belong to the selected brand"]
+          template: "User see all [Result Product Name] contain {{selected_brand}}"
+      - theme: add-to-cart
+        keywords: ["add to cart", added, "added to cart"]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["add to cart", "added to cart", "added confirmation", "added message"]
+          template: "User see [Added Message] text contains {{added_message}}   (not just the modal)"
+  form:
+    detect_keywords: [form, submit, field, input, validation]
+    must_cover:
+      - theme: required-validation
+        keywords: [required, empty, "must be", validation]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["required", "must be", "error message", "validation message"]
+          template: "User see [Field Error] message with {{error_text}}"
+      - theme: format-boundary
+        keywords: [format, invalid, boundary, length, range]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["invalid format", "boundary", "max length", "min length", "out of range"]
+          template: "User see [Field Error] message with {{error_text}}"
+      - theme: submit-success
+        keywords: [submit, success, saved, created]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["success message", "saved", "created", "confirmation"]
+          template: "User see [Success] message with {{success_text}}"
+  auth:
+    detect_keywords: [login, logout, password, signin, "sign in", credential]
+    must_cover:
+      # No `depth:` block → visibility/navigation theme (landing/seeing IS the assertion).
+      - theme: valid-login
+        keywords: ["valid", login, success]
+      - theme: invalid-credential
+        keywords: ["invalid", "wrong password", error, incorrect]
+        depth:
+          requires: data-assertion
+          cross_screen: false
+          keywords: ["wrong password", "incorrect", "invalid credential", "error message"]
+          template: "User see [Login Error] message with {{error_text}}"
+      # No `depth:` block → visibility/navigation theme (landing/seeing IS the assertion).
+      - theme: access-control
+        keywords: ["unauthorized", "redirect", "not logged in", permission]
+# Universal themes worth checking on ANY page-type (low-weight reminders).
+universal:
+  - theme: error-empty-state
+    keywords: ["empty", "no data", "no result", "failed", "error state"]
+  - theme: accessibility
+    keywords: ["keyboard", "tab order", "accessible", "aria", "focus"]

package/src/harness/challenge.ts ADDED Viewed

@@ -0,0 +1,131 @@
+/**
+ * Challenge / Exploration Harness (harness-roadmap P4) — Loop 2.
+ *
+ * Production mode (Loop 1: create-test → audit gate → repair) is deterministic by
+ * design: same spec + viewpoint → same official suite. That is the *feature* for
+ * delivery/CI, but it can feel like "a machine that always outputs the same thing".
+ *
+ * The Challenge Harness is the antidote: it does NOT regenerate the suite — it
+ * ATTACKS the existing one to surface what production missed. It is advisory
+ * (never auto-merges) and read-only.
+ *
+ * This module is the DETERMINISTIC spine — three structural critics:
+ *   1. Coverage   — over-covered (low-value) areas + shallow gate themes.
+ *   2. Depth      — titles that claim a collection but assert a single element.
+ *   3. Novelty    — risk-based prompts the AI `sungen-challenge` agent expands into
+ *                   concrete novelty candidates (semantic — not deterministic here).
+ *
+ * The AI agent layer adds the semantic + novelty judgement on top of this spine.
+ */
+import * as path from 'path';
+import { loadScenarios, ScenarioInfo } from './parse';
+import { runAudit } from './audit';
+/** One finding raised by a critic: which scenario, what is wrong, and how to fix it. */
+export interface ChallengeFinding {
+  // Scenario the finding refers to (buildChallenge fills this with the scenario's name).
+  scenario?: string;
+  // Human-readable description of the problem.
+  issue: string;
+  // Human-readable recommended fix.
+  suggestion: string;
+}
+/**
+ * Advisory report for one screen, produced by `buildChallenge` and rendered to
+ * Markdown by `renderChallengeMarkdown`. Fields are grouped by the critic that
+ * fills them.
+ */
+export interface ChallengeReport {
+  // Screen name the report was built for.
+  screen: string;
+  // Coverage critic
+  overCovered: { bucket: string; count: number; note: string }[];
+  shallowThemes: string[];
+  // Depth critic
+  collectionClaimSingular: ChallengeFinding[];
+  // Novelty critic (deterministic prompts → AI agent fills candidates)
+  noveltyPrompts: string[];
+  // Roll-up
+  explorationReadiness: string[];
+}
+// A collection claim = a PLURAL set noun (or an explicit quantifier) used with a
+// DISPLAY verb. Plural-only avoids false-positives on single-item actions like
+// "Adding A product ... shows the dialog" (which correctly asserts one item).
+// All three patterns are case-insensitive and whole-word (`\b…\b`); buildChallenge
+// tests them against the scenario name.
+const PLURAL_NOUN = /\b(cards|items|products|rows|results|prices|entries|records)\b/i;
+const QUANTIFIER = /\b(all|every|each)\b/i;
+// NOTE(review): `grid` is a noun, not a verb — presumably listed so titles such as
+// "... in a grid" count as a display claim; confirm this is intended.
+const DISPLAY_VERB = /\b(displays?|shows?|lists?|grid|contains?)\b/i;
+/**
+ * Risk lenses the Novelty critic prompts the AI to explore (beyond the catalog).
+ * Each entry is a plain-text phrase; buildChallenge turns every lens into one
+ * prompt string, so adding an entry here adds one prompt to the report.
+ */
+const NOVELTY_LENSES = [
+  'double-submit / rapid repeat of the primary action (duplicate side-effects?)',
+  'state after partial / slow load (assert against a not-yet-ready page)',
+  'boundary & unusual data (very long text, 0 / max quantity, special chars)',
+  'concurrency / back-button / refresh mid-flow',
+  'historical-incident mindset — what has broken on similar screens before?',
+];
+/**
+ * Build the deterministic Challenge Report for one screen.
+ *
+ * Loads scenarios from `<screenDir>/features/<screenName>.feature`, runs the audit
+ * for the same screen, and applies three structural critics (coverage, depth,
+ * novelty) to the results.
+ *
+ * @param screenDir  Directory of the screen; the feature file is looked up under its `features/` folder.
+ * @param screenName Screen name; used as the feature file base name and as `report.screen`.
+ * @returns The report. `explorationReadiness` always has at least one entry (the
+ *          closing novelty note); the other lists may be empty.
+ */
+export function buildChallenge(screenDir: string, screenName: string): ChallengeReport {
+  const featurePath = path.join(screenDir, 'features', `${screenName}.feature`);
+  const scenarios: ScenarioInfo[] = loadScenarios(featurePath);
+  const audit = runAudit(screenDir, screenName);
+  // 1. Coverage critic — over-covered buckets (secondary >> business-core) + shallow themes.
+  const buckets = audit.balance.byBucket;
+  const core = buckets['business-core'] || 0;
+  const overCovered: ChallengeReport['overCovered'] = [];
+  for (const [bucket, count] of Object.entries(buckets)) {
+    // 'business-core' is the baseline and 'other' is never reported as over-covered.
+    if (['business-core', 'other'].includes(bucket)) continue;
+    // Flag only when a baseline exists (core > 0) and the bucket exceeds it by more than 50%.
+    if (core > 0 && count > core * 1.5) {
+      overCovered.push({ bucket, count, note: `${count} scenarios vs ${core} business-core — likely low-value expansion; trim toward correctness.` });
+    }
+  }
+  const shallowThemes = audit.gate.gaps.filter((g) => g.status === 'shallow').map((g) => g.theme);
+  // 2. Depth critic — title claims a collection but the assertion is singular (no "see all").
+  const collectionClaimSingular: ChallengeFinding[] = [];
+  for (const s of scenarios) {
+    // Manual scenarios and NAV-category scenarios are exempt from the depth check.
+    if (s.manual || s.category === 'NAV') continue;
+    const claimsCollection = (PLURAL_NOUN.test(s.name) || QUANTIFIER.test(s.name)) && DISPLAY_VERB.test(s.name);
+    // NOTE(review): this match is case-sensitive — presumably `haystack` is already
+    // lower-cased by the parser; confirm, otherwise "See all" would be missed.
+    const assertsAll = /\bsee all\b/.test(s.haystack);
+    if (claimsCollection && !assertsAll) {
+      collectionClaimSingular.push({
+        scenario: s.name,
+        issue: 'Title implies a set (cards/items/all) but the assertion targets a single element.',
+        suggestion: 'Prove EVERY member: `Then User see all [<Card/Row>] contain {{...}}` instead of a single `see [X]`.',
+      });
+    }
+  }
+  // 3. Novelty critic — deterministic prompts; the AI agent expands these into candidates.
+  const noveltyPrompts = NOVELTY_LENSES.map((l) => `Find 1 non-obvious, valuable scenario via: ${l}`);
+  // Roll-up — exploration readiness signals (not a fake score).
+  const explorationReadiness: string[] = [];
+  if (collectionClaimSingular.length) explorationReadiness.push(`${collectionClaimSingular.length} title↔assertion gap(s) — deterministic depth critic flagged these; an AI Business-Depth critic should confirm + fix.`);
+  if (overCovered.length) explorationReadiness.push(`${overCovered.length} possibly over-covered area(s) — rebalance toward correctness.`);
+  if (shallowThemes.length) explorationReadiness.push(`Shallow themes: ${shallowThemes.join(', ')}.`);
+  // Always appended, so the readiness list is never empty.
+  explorationReadiness.push('Novelty candidates are NOT generated deterministically — run the `sungen-challenge` agent (Claude) or its inline criteria (Copilot) to propose them, then QA accept/reject (≤20% of official, no auto-merge).');
+  return { screen: screenName, overCovered, shallowThemes, collectionClaimSingular, noveltyPrompts, explorationReadiness };
+}
+/**
+ * Render the Challenge Report as Markdown (advisory — not part of the official suite).
+ *
+ * Sections are emitted in a fixed order: Depth, Coverage, Novelty, Exploration
+ * readiness. Empty Depth / Coverage sections print `_none_` / `_balanced_`.
+ *
+ * NOTE(review): finding fields are interpolated into table cells without escaping,
+ * so a `|` inside a scenario name would split the row, and a finding without
+ * `scenario` prints the text "undefined".
+ *
+ * @param r Report to render.
+ * @returns Markdown text with lines joined by `\n` and a trailing newline.
+ */
+export function renderChallengeMarkdown(r: ChallengeReport): string {
+  const lines: string[] = [];
+  lines.push(`# Challenge Report — ${r.screen}`, '');
+  lines.push('> Advisory (Loop 2 / exploration mode). Does NOT change the official suite — it attacks it to surface blind spots. QA decides what to adopt.', '');
+  lines.push('## Depth — title claims a collection but asserts a single element');
+  if (r.collectionClaimSingular.length) {
+    lines.push('| Scenario | Issue | Suggested |', '|---|---|---|');
+    for (const f of r.collectionClaimSingular) lines.push(`| ${f.scenario} | ${f.issue} | ${f.suggestion} |`);
+  } else lines.push('_none_');
+  lines.push('');
+  lines.push('## Coverage — possibly over-covered / shallow');
+  if (r.overCovered.length) for (const o of r.overCovered) lines.push(`- **${o.bucket}** — ${o.note}`);
+  if (r.shallowThemes.length) lines.push(`- Shallow themes: ${r.shallowThemes.join(', ')}`);
+  if (!r.overCovered.length && !r.shallowThemes.length) lines.push('_balanced_');
+  lines.push('');
+  lines.push('## Novelty — prompts for the AI critic (expand into candidates, ≤20% of official, no auto-merge)');
+  for (const p of r.noveltyPrompts) lines.push(`- ${p}`);
+  lines.push('');
+  lines.push('## Exploration readiness');
+  for (const e of r.explorationReadiness) lines.push(`- ${e}`);
+  // The final empty element yields the trailing newline after join.
+  lines.push('');
+  return lines.join('\n');
+}

package/src/harness/feedback.ts ADDED Viewed

@@ -0,0 +1,84 @@
+/**
+ * Feedback — local-first capture of QA feedback. The most valuable use of
+ * feedback is closing the learning loop WITHIN a project (feed reuse/regenerate
+ * + improve the Guide), which needs no server. A future opt-in `feedback sync`
+ * can push anonymized metadata to a central API (step 2; Vietnamese "Nấc 2") — not implemented here.
+ *
+ * Two purposes are kept distinct:
+ *   - test-design: a viewpoint/scenario is wrong / missing / duplicate  → knowledge
+ *   - product:     Sungen itself did X wrong                            → telemetry/issue
+ *
+ * Storage: .sungen/feedback/feedback.jsonl (append-only)
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+/** Purpose of a feedback entry: about the test design, about Sungen itself, or anything else. */
+export type FeedbackType = 'test-design' | 'product' | 'other';
+/** What QA decided about the target; recordFeedback stores 'none' when no decision is given. */
+export type FeedbackDecision = 'accept' | 'reject' | 'edit' | 'add' | 'none';
+/** One line of the feedback log (serialized as a single JSON object per line). */
+export interface FeedbackEntry {
+  // Timestamp; recordFeedback defaults it to the current time as an ISO-8601 string.
+  ts: string;
+  type: FeedbackType;
+  // Screen the feedback relates to; also used to look up the audit score snapshot.
+  screen?: string;
+  target?: string;        // viewpoint id / scenario / command / artifact the feedback is about
+  decision?: FeedbackDecision;
+  message: string;
+  reason?: string;
+  source: string;         // who (default: qa)
+  auditScore?: number;    // snapshot of current audit score if available
+}
+/** Path of the feedback log: `.sungen/feedback/feedback.jsonl` under the current working directory. */
+function feedbackPath(): string {
+  return path.join(process.cwd(), '.sungen', 'feedback', 'feedback.jsonl');
+}
+/**
+ * Best-effort read of `score.overall` from `.sungen/reports/<screen>-audit.json`
+ * (relative to the current working directory).
+ *
+ * @param screen Screen name; when omitted no lookup is attempted.
+ * @returns The stored value, or `undefined` when no screen is given, the report
+ *          file does not exist, or reading/parsing it fails. The value is returned
+ *          as parsed — it is not checked to actually be a number.
+ */
+function currentAuditScore(screen?: string): number | undefined {
+  if (!screen) return undefined;
+  const p = path.join(process.cwd(), '.sungen', 'reports', `${screen}-audit.json`);
+  if (!fs.existsSync(p)) return undefined;
+  // Optional chaining tolerates a report without `score`; any read/parse error is swallowed.
+  try { return JSON.parse(fs.readFileSync(p, 'utf-8'))?.score?.overall; } catch { return undefined; }
+}
+/**
+ * Append one feedback entry to the feedback log, creating the directory if needed.
+ *
+ * Defaults applied: `ts` → current time (ISO string), `decision` → 'none',
+ * `source` → 'qa'. `auditScore` is always taken from the screen's current audit
+ * report (callers cannot supply it).
+ *
+ * @param entry Feedback fields; `ts` and `source` are optional overrides.
+ * @returns Path of the log file that was appended to.
+ */
+export function recordFeedback(entry: Omit<FeedbackEntry, 'ts' | 'source' | 'auditScore'> & { ts?: string; source?: string }): string {
+  const p = feedbackPath();
+  fs.mkdirSync(path.dirname(p), { recursive: true });
+  const full: FeedbackEntry = {
+    ts: entry.ts ?? new Date().toISOString(),
+    type: entry.type,
+    screen: entry.screen,
+    target: entry.target,
+    decision: entry.decision ?? 'none',
+    message: entry.message,
+    reason: entry.reason,
+    source: entry.source ?? 'qa',
+    auditScore: currentAuditScore(entry.screen),
+  };
+  // One JSON object per line (JSONL); undefined fields are dropped by JSON.stringify.
+  fs.appendFileSync(p, JSON.stringify(full) + '\n', 'utf-8');
+  return p;
+}
+/**
+ * Read every entry from the feedback log.
+ *
+ * Lines are trimmed before parsing, so CRLF line endings and whitespace-only
+ * lines are tolerated. A line that is not valid JSON (e.g. a torn append) is
+ * skipped instead of making the whole log unreadable.
+ *
+ * @returns Parsed entries in file order; an empty array when the log does not exist.
+ */
+export function readFeedback(): FeedbackEntry[] {
+  const p = feedbackPath();
+  if (!fs.existsSync(p)) return [];
+  const entries: FeedbackEntry[] = [];
+  for (const raw of fs.readFileSync(p, 'utf-8').split('\n')) {
+    const line = raw.trim();
+    if (!line) continue;
+    try {
+      entries.push(JSON.parse(line) as FeedbackEntry);
+    } catch {
+      // Best-effort: ignore a corrupt line and keep the rest of the log usable.
+    }
+  }
+  return entries;
+}
+/** Aggregate view over (optionally filtered) feedback entries, as returned by `summarize`. */
+export interface FeedbackSummary {
+  // Number of entries after filtering.
+  total: number;
+  // Entry count keyed by feedback type.
+  byType: Record<string, number>;
+  // Entry count keyed by decision; entries without a decision are counted under 'none'.
+  byDecision: Record<string, number>;
+  // The filtered entries themselves.
+  entries: FeedbackEntry[];
+}
+/**
+ * Summarize the feedback log, optionally restricted to one screen and/or one type.
+ *
+ * @param filter Optional exact-match filters; an omitted (or empty) field is not applied.
+ * @returns Totals plus per-type and per-decision counts for the matching entries.
+ */
+export function summarize(filter?: { screen?: string; type?: FeedbackType }): FeedbackSummary {
+  let entries = readFeedback();
+  if (filter?.screen) entries = entries.filter((e) => e.screen === filter.screen);
+  if (filter?.type) entries = entries.filter((e) => e.type === filter.type);
+  const byType: Record<string, number> = {};
+  const byDecision: Record<string, number> = {};
+  for (const e of entries) {
+    byType[e.type] = (byType[e.type] || 0) + 1;
+    // A missing decision is bucketed as 'none'.
+    byDecision[e.decision || 'none'] = (byDecision[e.decision || 'none'] || 0) + 1;
+  }
+  return { total: entries.length, byType, byDecision, entries };
+}

package/src/harness/intent.ts ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * Intent Profile (harness-roadmap P3) — lets the gate flex to project intent.
+ *
+ * Read from qa/context.md so a project can declare what it cares about:
+ *
+ *   ## Intent
+ *   focus: security          # functional | e-commerce | security | smoke
+ *   risk_tier: high          # high | normal | low   (reserved — future weighting)
+ *   tier_scope: full         # tier-1 | full         (reserved)
+ *
+ * Keys may appear anywhere in context.md (a heading is optional). Unknown / missing
+ * values fall back to the safe default (functional), so behaviour is unchanged for
+ * projects that haven't declared an intent.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+/** Declared project focus; 'functional' is the fallback used by DEFAULT_INTENT. */
+export type IntentFocus = 'functional' | 'e-commerce' | 'security' | 'smoke';
+/** Resolved intent of a project, as returned by `readIntent`. */
+export interface IntentProfile {
+  focus: IntentFocus;
+  riskTier: 'high' | 'normal' | 'low';
+  tierScope: 'tier-1' | 'full';
+  // 'context.md' when at least one intent key was found in the file, else 'default'.
+  source: 'context.md' | 'default';
+}
+// Fallback profile used when context.md is missing/unreadable, and per-field when a
+// declared value is absent or not one of the allowed values.
+const DEFAULT_INTENT: IntentProfile = {
+  focus: 'functional', riskTier: 'normal', tierScope: 'full', source: 'default',
+};
+// Allowed `focus` values — mirrors the IntentFocus union for runtime validation.
+const FOCI: IntentFocus[] = ['functional', 'e-commerce', 'security', 'smoke'];
+/**
+ * Resolve project root from a screen/flow dir (…/qa/screens/<name>).
+ *
+ * @param screenDir Screen or flow directory.
+ * @returns Absolute path three levels above `screenDir` (resolved against the
+ *          current working directory when `screenDir` is relative).
+ */
+export function projectRootFromScreenDir(screenDir: string): string {
+  return path.resolve(screenDir, '..', '..', '..');
+}
+/**
+ * Read the project's intent profile from `<projectRoot>/qa/context.md`.
+ *
+ * The file is lower-cased as a whole, then each key (`focus`, `risk_tier`,
+ * `tier_scope`) is looked up as `key: value` at the start of a line (leading
+ * whitespace allowed). A value that is missing or not in the allowed set falls
+ * back to the corresponding DEFAULT_INTENT field.
+ *
+ * @param projectRoot Project root directory.
+ * @returns The resolved profile. When the file is missing or unreadable the shared
+ *          DEFAULT_INTENT object itself is returned. `source` is 'context.md' as
+ *          soon as any of the three keys was matched — even if its value was
+ *          invalid and the default was used for that field.
+ */
+export function readIntent(projectRoot: string): IntentProfile {
+  const ctx = path.join(projectRoot, 'qa', 'context.md');
+  if (!fs.existsSync(ctx)) return DEFAULT_INTENT;
+  let text: string;
+  try { text = fs.readFileSync(ctx, 'utf-8').toLowerCase(); } catch { return DEFAULT_INTENT; }
+  // First occurrence of `key: value`; the value is a run of [a-z0-9-], so a trailing
+  // `# comment` on the same line is not captured.
+  const grab = (key: string): string | undefined => {
+    const m = text.match(new RegExp(`(?:^|\\n)\\s*${key}\\s*:\\s*([a-z0-9-]+)`));
+    return m?.[1];
+  };
+  const focusRaw = grab('focus');
+  const focus = (FOCI.includes(focusRaw as IntentFocus) ? focusRaw : DEFAULT_INTENT.focus) as IntentFocus;
+  const risk = grab('risk_tier');
+  const riskTier = (['high', 'normal', 'low'].includes(risk as string) ? risk : DEFAULT_INTENT.riskTier) as IntentProfile['riskTier'];
+  const scope = grab('tier_scope');
+  const tierScope = (['tier-1', 'full'].includes(scope as string) ? scope : DEFAULT_INTENT.tierScope) as IntentProfile['tierScope'];
+  // Uses the raw matches, not the validated values.
+  const found = focusRaw || risk || scope;
+  return { focus, riskTier, tierScope, source: found ? 'context.md' : 'default' };
+}

package/src/harness/ledger.ts ADDED Viewed

@@ -0,0 +1,155 @@
+/**
+ * Usage Ledger — observability for AI resource spend during test design.
+ *
+ * The orchestrator records one event per step; the report aggregates them and
+ * derives efficiency metrics so you can SEE where cost goes (e.g. "50% of tokens
+ * spent in repair" → first-pass generation is weak; fix the Guide, not the count).
+ *
+ * Tokens are OPTIONAL: exact counts come from the orchestrator/harness when known;
+ * otherwise the ledger still reports time, repair rounds, and per-step structure.
+ * (Exact AI token capture is a harness-integration concern; the schema is ready.)
+ *
+ * Events: .sungen/ledger/<screen>.jsonl (append-only)
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+/** One recorded step of a run (serialized as a single JSON object per ledger line). */
+export interface LedgerEvent {
+  // Timestamp string; parsed with Date.parse when runs are segmented by time gap.
+  ts: string;
+  step: string;            // e.g. discovery | viewpoint | gherkin | audit | repair:1
+  runId?: string;          // groups events of ONE create-test invocation (P2)
+  model?: string;
+  // Token counts are optional; the report treats a missing count as 0.
+  tokensIn?: number;
+  tokensOut?: number;
+  // Duration in milliseconds; a missing value is treated as 0 in the report.
+  ms?: number;
+  note?: string;
+}
+// The ledger is append-only across runs. When events carry no runId, a gap longer
+// than this between consecutive timestamps marks a new run. Default report/trace
+// scope to the LATEST run so multiple invocations don't get conflated.
+// Value: 30 minutes, in milliseconds.
+const RUN_GAP_MS = 30 * 60 * 1000;
+/**
+ * Split events into runs — by runId when present, else by timestamp gap.
+ *
+ * - If ANY event has a `runId`, events are grouped by `runId` in first-seen order;
+ *   all events without a `runId` share a single '__none__' group.
+ * - Otherwise a copy of the events is sorted by timestamp and a new run starts
+ *   whenever two consecutive timestamps are more than RUN_GAP_MS apart.
+ *
+ * @param events Ledger events (not mutated).
+ * @returns One array per run; an empty array for empty input.
+ */
+export function segmentRuns(events: LedgerEvent[]): LedgerEvent[][] {
+  if (events.length === 0) return [];
+  if (events.some((e) => e.runId)) {
+    // `order` remembers first-seen run ids so runs come out in ledger order.
+    const order: string[] = [];
+    const groups = new Map<string, LedgerEvent[]>();
+    for (const e of events) {
+      const key = e.runId ?? '__none__';
+      if (!groups.has(key)) { groups.set(key, []); order.push(key); }
+      groups.get(key)!.push(e);
+    }
+    return order.map((k) => groups.get(k)!);
+  }
+  const sorted = [...events].sort((a, b) => Date.parse(a.ts) - Date.parse(b.ts));
+  const runs: LedgerEvent[][] = [[sorted[0]]];
+  for (let i = 1; i < sorted.length; i++) {
+    // An unparseable timestamp gives a NaN gap, which never exceeds the threshold,
+    // so such an event stays in the current run.
+    const gap = Date.parse(sorted[i].ts) - Date.parse(sorted[i - 1].ts);
+    if (gap > RUN_GAP_MS) runs.push([]);
+    runs[runs.length - 1].push(sorted[i]);
+  }
+  return runs;
+}
+/**
+ * Events of the most recent run only.
+ *
+ * "Most recent" is the last run returned by `segmentRuns`; returns an empty array
+ * when there are no events.
+ */
+export function latestRunEvents(events: LedgerEvent[]): LedgerEvent[] {
+  const runs = segmentRuns(events);
+  return runs.length ? runs[runs.length - 1] : [];
+}
+/** Path of a screen's ledger: `.sungen/ledger/<screen>.jsonl` under the current working directory. */
+function ledgerPath(screen: string): string {
+  return path.join(process.cwd(), '.sungen', 'ledger', `${screen}.jsonl`);
+}
+/**
+ * Append one event to the screen's ledger, creating the directory if needed.
+ *
+ * @param screen Screen name (selects the ledger file).
+ * @param ev     Event fields; `ts` defaults to the current time as an ISO string.
+ * @returns Path of the ledger file that was appended to.
+ */
+export function recordEvent(screen: string, ev: Omit<LedgerEvent, 'ts'> & { ts?: string }): string {
+  const p = ledgerPath(screen);
+  fs.mkdirSync(path.dirname(p), { recursive: true });
+  // Pull `ts` out before spreading: with `{ ts: default, ...ev }` an explicit
+  // `ts: undefined` in `ev` would overwrite the default and the event would be
+  // written without a timestamp. `ts` stays the first key of the JSON line.
+  const { ts, ...rest } = ev;
+  const full: LedgerEvent = { ts: ts ?? new Date().toISOString(), ...rest };
+  fs.appendFileSync(p, JSON.stringify(full) + '\n', 'utf-8');
+  return p;
+}
+/**
+ * Read every event from a screen's ledger.
+ *
+ * Lines are trimmed before parsing, so CRLF line endings and whitespace-only
+ * lines are tolerated. A line that is not valid JSON (e.g. a torn append) is
+ * skipped instead of making the whole ledger unreadable.
+ *
+ * @param screen Screen name (selects the ledger file).
+ * @returns Parsed events in file order; an empty array when the ledger does not exist.
+ */
+export function readEvents(screen: string): LedgerEvent[] {
+  const p = ledgerPath(screen);
+  if (!fs.existsSync(p)) return [];
+  const events: LedgerEvent[] = [];
+  for (const raw of fs.readFileSync(p, 'utf-8').split('\n')) {
+    const line = raw.trim();
+    if (!line) continue;
+    try {
+      events.push(JSON.parse(line) as LedgerEvent);
+    } catch {
+      // Best-effort: ignore a corrupt line and keep the rest of the ledger usable.
+    }
+  }
+  return events;
+}
+/** Aggregated usage report for one screen, as returned by `buildReport`. */
+export interface LedgerReport {
+  screen: string;
+  runs: number;                 // total runs detected in the ledger
+  runScope: 'latest' | 'all';   // which runs this report covers
+  // Number of events inside the reported scope.
+  events: number;
+  // Sum of tokensIn + tokensOut over the reported events.
+  totalTokens: number;
+  totalMs: number;
+  // Totals per step; a trailing `:<n>` suffix is stripped, so repair:1 and repair:2 share the key `repair`.
+  byStep: Record<string, { tokens: number; ms: number; count: number }>;
+  // Number of DISTINCT step names that start with "repair".
+  repairRounds: number;
+  repairTokenPct: number;       // share of tokens spent in repair (0..1)
+  // The next four are null when the audit report (or the needed field) is unavailable.
+  coveredCriticalViewpoints: number | null;
+  tokensPerCoveredCritical: number | null;
+  scenarioCount: number | null;
+  tokensPerScenario: number | null;
+  // Human-readable efficiency remarks; never empty.
+  verdicts: string[];
+}
+/**
+ * Aggregate a screen's ledger into a usage report with efficiency verdicts.
+ *
+ * By default only the latest run is reported; pass `allRuns: true` to cover the
+ * whole ledger. Audit signals (covered themes, scenario count) are read
+ * best-effort from `.sungen/reports/<screen>-audit.json` under the current
+ * working directory.
+ *
+ * @param screen Screen name (selects the ledger and audit files).
+ * @param opts   `allRuns` — report every run instead of only the latest.
+ * @returns The aggregated report; `verdicts` always has at least one entry.
+ */
+export function buildReport(screen: string, opts: { allRuns?: boolean } = {}): LedgerReport {
+  const allEvents = readEvents(screen);
+  const runCount = segmentRuns(allEvents).length;
+  const events = opts.allRuns ? allEvents : latestRunEvents(allEvents);
+  const byStep: LedgerReport['byStep'] = {};
+  let totalTokens = 0, totalMs = 0, repairTokens = 0;
+  const repairSteps = new Set<string>();
+  for (const e of events) {
+    const tok = (e.tokensIn ?? 0) + (e.tokensOut ?? 0);
+    // Strip a trailing ":<n>" so numbered steps (e.g. repair:1, repair:2) aggregate together.
+    const stepKey = e.step.replace(/:\d+$/, '');
+    byStep[stepKey] = byStep[stepKey] || { tokens: 0, ms: 0, count: 0 };
+    byStep[stepKey].tokens += tok;
+    byStep[stepKey].ms += e.ms ?? 0;
+    byStep[stepKey].count += 1;
+    totalTokens += tok;
+    totalMs += e.ms ?? 0;
+    // The full step name (with suffix) is kept here, so the set size counts distinct repair rounds.
+    if (/^repair/i.test(e.step)) { repairTokens += tok; repairSteps.add(e.step); }
+  }
+  // Pull audit signals if present
+  let coveredCritical: number | null = null;
+  let scenarioCount: number | null = null;
+  const auditPath = path.join(process.cwd(), '.sungen', 'reports', `${screen}-audit.json`);
+  if (fs.existsSync(auditPath)) {
+    try {
+      const a = JSON.parse(fs.readFileSync(auditPath, 'utf-8'));
+      coveredCritical = a?.gate?.themesCovered ?? null;
+      scenarioCount = a?.scenarioCount ?? null;
+    } catch { /* ignore */ }
+  }
+  const verdicts: string[] = [];
+  // Guard against division by zero when no tokens were recorded.
+  const repairPct = totalTokens ? repairTokens / totalTokens : 0;
+  if (totalTokens === 0) verdicts.push('No token data recorded — record steps with --tokens-in/--tokens-out for cost metrics (time & rounds still shown).');
+  if (repairPct > 0.4) verdicts.push(`High repair cost (${(repairPct * 100).toFixed(0)}% of tokens in repair) → first-pass generation is weak; improve the Guide/viewpoint step, not the count.`);
+  if (repairSteps.size >= 3) verdicts.push(`${repairSteps.size} repair rounds — near/over budget; check which sensor keeps failing.`);
+  if (coveredCritical != null && totalTokens && coveredCritical > 0 && totalTokens / coveredCritical > 30000) {
+    verdicts.push(`~${Math.round(totalTokens / coveredCritical / 1000)}k tokens per covered critical viewpoint — expensive; check for low-value expansion.`);
+  }
+  // No warning triggered → emit a positive default so the list is never empty.
+  if (verdicts.length === 0) verdicts.push('Within expected efficiency.');
+  return {
+    screen,
+    runs: runCount,
+    runScope: opts.allRuns ? 'all' : 'latest',
+    events: events.length,
+    totalTokens,
+    totalMs,
+    byStep,
+    repairRounds: repairSteps.size,
+    // Rounded to two decimals.
+    repairTokenPct: Math.round(repairPct * 100) / 100,
+    coveredCriticalViewpoints: coveredCritical,
+    // Null when the divisor or the token total is 0/absent.
+    tokensPerCoveredCritical: coveredCritical && totalTokens ? Math.round(totalTokens / coveredCritical) : null,
+    scenarioCount,
+    tokensPerScenario: scenarioCount && totalTokens ? Math.round(totalTokens / scenarioCount) : null,
+    verdicts,
+  };
+}