npm - @sun-asterisk/sungen - Versions diffs - 3.2.1-beta.1 → 3.2.2-beta.10 - Mend

@sun-asterisk/sungen 3.2.1-beta.1 → 3.2.2-beta.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/src/harness/journey.ts ADDED Viewed

@@ -0,0 +1,333 @@
+/**
+ * Journey board (epic #381, story S1) — the durable, read-only "you are here" view.
+ *
+ * Reconstructs the QA's position from artifacts ALREADY on disk (no recomputation, no context
+ * needed): the audit report (`.sungen/reports/<slug>-audit.json`) supplies the obligation status
+ * via its calibration axes + gate + findings; the ledger (`.sungen/ledger/<slug>.jsonl`) supplies
+ * the phase history ("you are here"). The output answers the three QA questions — what's next /
+ * what to review / what's doubtful — and persists `.sungen/journey/<slug>.{json,board.md}`.
+ *
+ * S1 = the read-only synthesis. S2 (this file) adds the **writable lifecycle**: persisted
+ * waivers (reason-required, anti-amnesia), reconcile (auto-close satisfied; re-surface a waiver
+ * when its evidence changed), via `runJourney` + `waive`. S3 (obligations as inter-phase gate
+ * predicates, `runGate`) and S5 (review-queue anchors + the human `signoff`) also live in this
+ * file, further down. Pure-deterministic, no AI.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+import * as crypto from 'crypto';
+import { reportSlug } from './unit-paths';
+export type ObStatus = 'satisfied' | 'needs-work' | 'pending' | 'waived';
+export interface Obligation {
+  id: string;
+  title: string;
+  status: ObStatus;
+  detail: string;
+  waivedReason?: string;   // S2 — set when the QA explicitly waived this obligation
+}
+/** S5 — a parsed anchor so an IDE can jump straight to what needs review. */
+export interface Anchor { vpId?: string; file?: string; line?: number; }
+export interface ReviewItem { text: string; anchor: Anchor; }
+export interface JourneyReport {
+  unit: string;
+  generatedFrom: string[];          // which artifacts were available
+  youAreHere: string;               // last ledger phase, or 'not started'
+  phasesDone: string[];
+  gateStatus: string | null;
+  score: number | null;
+  obligations: Obligation[];
+  needsYou: ReviewItem[];           // S5 — the review queue: human-judgment findings, each with an anchor
+  nextSuggested: string;
+}
+function readJSON(p: string): any { // Best-effort read: the parsed JSON, or null when the file is absent or cannot be read/parsed.
+  try { return fs.existsSync(p) ? JSON.parse(fs.readFileSync(p, 'utf-8')) : null; } catch { return null; }
+}
+function readLedgerPhases(p: string): string[] { // Collects the `step` of each parseable JSONL line, in file order; [] when the file is missing.
+  if (!fs.existsSync(p)) return [];
+  const out: string[] = [];
+  for (const line of fs.readFileSync(p, 'utf-8').split('\n')) {
+    if (!line.trim()) continue; // ignore blank lines
+    try { const d = JSON.parse(line); if (d.step) out.push(String(d.step)); } catch { /* skip lines that are not valid JSON (or have no usable object) */ }
+  }
+  return out;
+}
+/**
+ * Findings that ask for HUMAN judgment (the review-queue seed), vs pure machine-repair targets.
+ * A finding qualifies when its text contains any of the markers/keywords in the pattern below
+ * (case-insensitive, matched anywhere in the string).
+ */
+function isHumanFinding(f: string): boolean {
+  return /@manual|MANUAL-|DEPTH-DEFERRED|UNSOURCEABLE|CAPABILITY-SUGGESTION|judgment|oracle|review/i.test(f);
+}
+/**
+ * S5 — pull an IDE-jumpable anchor out of a finding: the first `VP-…` id and/or the first
+ * `file.ext:line` reference (extensions: md, feature, ts, yaml). Any part that is not found
+ * is left undefined, so the result may be an anchor with no fields set.
+ */
+function extractAnchor(text: string): Anchor {
+  const vp = text.match(/\bVP-[A-Z0-9-]+\b/);
+  const fl = text.match(/([\w./-]+\.(?:md|feature|ts|yaml)):(\d+)/);
+  return { vpId: vp ? vp[0] : undefined, file: fl ? fl[1] : undefined, line: fl ? Number(fl[2]) : undefined };
+}
+const SAT = 0.8; // axis at/above this = satisfied (below = needs-work)
+/**
+ * Build the journey report from the artifacts currently on disk: the audit report
+ * (`.sungen/reports/<slug>-audit.json`) and the ledger (`.sungen/ledger/<slug>.jsonl`).
+ * Waivers and sign-off are NOT applied here — `runJourney` layers them on top.
+ *
+ * Without an audit report the result carries a single pending `OB-create` obligation.
+ * With one, it carries (in this order) OB-spec, OB-coverage, OB-depth, OB-trace,
+ * OB-automation and OB-signoff, plus the review queue of human-judgment findings.
+ *
+ * @param projectRoot directory that contains `.sungen/`
+ * @param unit        unit name; mapped to the artifact slug via `reportSlug`
+ */
+function computeFresh(projectRoot: string, unit: string): JourneyReport {
+  const slug = reportSlug(unit);
+  const audit = readJSON(path.join(projectRoot, '.sungen', 'reports', `${slug}-audit.json`));
+  const phases = readLedgerPhases(path.join(projectRoot, '.sungen', 'ledger', `${slug}.jsonl`));
+  const from: string[] = [];
+  if (audit) from.push('audit');
+  if (phases.length) from.push('ledger');
+  const youAreHere = phases.length ? phases[phases.length - 1] : 'not started';
+  // `startsWith` already covers the exact-match case.
+  const ran = (p: string) => phases.some((s) => s.startsWith(p));
+  const obligations: Obligation[] = [];
+  const needsYou: ReviewItem[] = [];
+  if (!audit) {
+    // Nothing measured yet — the journey hasn't really begun.
+    obligations.push({ id: 'OB-create', title: 'Generate test cases', status: 'pending', detail: 'No audit report yet — run /sungen:create-test.' });
+    return {
+      unit, generatedFrom: from, youAreHere, phasesDone: phases, gateStatus: null, score: null,
+      obligations, needsYou, nextSuggested: 'Run `/sungen:create-test ' + unit + '` to begin.',
+    };
+  }
+  const ax = (audit.calibration && audit.calibration.axes) || {};
+  const depthThreshold = (audit.depth && audit.depth.threshold) || SAT;
+  // An axis counts as measured only when it is an actual number. The audit comes from JSON,
+  // where an absent value may be `null` as well as a missing key; both mean "pending", not 0%.
+  const ob = (id: string, title: string, val: unknown, thr: number, detail: string): Obligation => {
+    if (typeof val !== 'number') return { id, title, status: 'pending', detail };
+    return {
+      id, title,
+      status: val >= thr ? 'satisfied' : 'needs-work',
+      detail: `${Math.round(val * 100)}% (need ${Math.round(thr * 100)}%) — ${detail}`,
+    };
+  };
+  obligations.push(ob('OB-spec', 'Spec readiness', ax.specFR, 1, 'FR/sections sufficient to design from'));
+  obligations.push(ob('OB-coverage', 'Viewpoint coverage', ax.coverage, SAT, 'critical themes covered'));
+  obligations.push(ob('OB-depth', 'Assertion depth', ax.businessDepth, depthThreshold, 'business-critical scenarios assert data'));
+  obligations.push(ob('OB-trace', 'Traceability', ax.taxonomy ?? ax.traceability, SAT, 'scenarios trace to viewpoints'));
+  // Automation obligation: automatable @manual still pending a driver = needs-work.
+  const autoPending = audit.automatableManual && audit.automatableManual.automatable > 0;
+  obligations.push({
+    id: 'OB-automation', title: 'Automation coverage',
+    status: autoPending ? 'needs-work' : 'satisfied',
+    detail: autoPending ? `${audit.automatableManual.automatable} @manual scenario(s) a driver could automate` : 'no capability-pending automatable cases',
+  });
+  // Needs-you: human-judgment findings (the review-queue seed), with their anchors (titles) intact.
+  // Filled BEFORE the sign-off obligation is built so its detail can carry the final count.
+  const findings: unknown[] = Array.isArray(audit.findings) ? audit.findings : [];
+  for (const f of findings) {
+    if (typeof f === 'string' && isHumanFinding(f)) needsYou.push({ text: f, anchor: extractAnchor(f) });
+  }
+  // Human sign-off: always pending until S5 explicit signoff; carries the review-queue count.
+  const manualCount = (audit.manualOracle && audit.manualOracle.total) || 0;
+  obligations.push({
+    id: 'OB-signoff', title: 'Human sign-off', status: 'pending',
+    detail: `${needsYou.length} item(s) queued for your review · ${manualCount} @manual scenario(s)`,
+  });
+  // Next suggested = first non-satisfied obligation; if all satisfied but run-test not done → run.
+  const firstGap = obligations.find((o) => o.status !== 'satisfied' && o.id !== 'OB-signoff');
+  let nextSuggested: string;
+  if (firstGap) nextSuggested = `Repair "${firstGap.title}" (${firstGap.detail}).`;
+  else if (!ran('run')) nextSuggested = `Quality satisfied — run \`/sungen:run-test ${unit}\`.`;
+  else nextSuggested = `All obligations satisfied — review the ${needsYou.length} queued item(s), then sign off & deliver.`;
+  return {
+    unit, generatedFrom: from, youAreHere, phasesDone: phases,
+    gateStatus: audit.gateStatus ?? null, score: (audit.score && audit.score.overall) ?? null,
+    obligations, needsYou, nextSuggested,
+  };
+}
+// ---------------- S2: writable lifecycle — persisted waivers + reconcile ----------------
+interface Waiver { reason: string; at: string; auditHashAtWaive: string; }   // at = ISO timestamp of the waive; auditHashAtWaive = audit hash recorded at that moment
+interface Signoff { by?: string; at: string; auditHashAtSignoff: string; }   // S5 — who (optional), when (ISO timestamp), and the audit hash recorded at sign-off
+interface JourneyState { unit: string; auditHash: string; waivers: Record<string, Waiver>; signoff?: Signoff; }   // persisted state; waivers are keyed by obligation id
+function statePath(projectRoot: string, slug: string): string { // -> <projectRoot>/.sungen/journey/<slug>.state.json
+  return path.join(projectRoot, '.sungen', 'journey', `${slug}.state.json`);
+}
+/**
+ * Evidence cursor: the audit report's content hash. A waiver is invalidated when this changes.
+ * Returns the SHA-256 hex digest of the audit file's bytes, or '' when the file does not exist.
+ */
+function auditHashOf(projectRoot: string, slug: string): string {
+  const p = path.join(projectRoot, '.sungen', 'reports', `${slug}-audit.json`);
+  return fs.existsSync(p) ? crypto.createHash('sha256').update(fs.readFileSync(p)).digest('hex') : '';
+}
+/**
+ * Load the persisted journey state, tolerating a damaged or hand-edited file.
+ *
+ * Returns null when the file is missing, unparseable, or does not hold a JSON object.
+ * When the object has no usable `waivers` map (missing, null, an array, a primitive) an empty
+ * map is substituted, so callers can index `state.waivers[...]` without crashing.
+ */
+function loadState(p: string): JourneyState | null {
+  const raw = readJSON(p);
+  if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return null;
+  const w = raw.waivers;
+  const waivers: Record<string, Waiver> = w && typeof w === 'object' && !Array.isArray(w) ? w : {};
+  return { ...raw, waivers };
+}
+function saveState(p: string, s: JourneyState): void { // Writes the state as 2-space-indented JSON.
+  fs.mkdirSync(path.dirname(p), { recursive: true }); // create the parent directory if it is not there yet
+  fs.writeFileSync(p, JSON.stringify(s, null, 2), 'utf-8');
+}
+/**
+ * Recompute nextSuggested AFTER waivers are applied (a waived obligation is not a gap).
+ * Priority: the first obligation that is neither satisfied nor waived (OB-signoff excluded);
+ * otherwise "run the tests" if no recorded phase starts with `run`; otherwise the
+ * review-and-sign-off prompt.
+ */
+function computeNext(r: JourneyReport, unit: string): string {
+  const gap = r.obligations.find((o) => o.status !== 'satisfied' && o.status !== 'waived' && o.id !== 'OB-signoff');
+  if (gap) return `Repair "${gap.title}" (${gap.detail}).`;
+  if (!r.phasesDone.some((s) => s === 'run' || s.startsWith('run'))) return `Quality satisfied — run \`/sungen:run-test ${unit}\`.`;
+  return `All obligations satisfied/waived — review the ${r.needsYou.length} queued item(s), then sign off & deliver.`;
+}
+/**
+ * The public entry: compute fresh, then RECONCILE with the persisted state —
+ *  - auto-close is automatic (fresh recompute reflects the current artifacts);
+ *  - an active waiver (evidence unchanged) sets status='waived' (carries the reason);
+ *  - a STALE waiver (audit changed since it was waived) is re-surfaced for re-decision (anti-amnesia).
+ * Then persist the current evidence cursor.
+ *
+ * Side effect: the state file is rewritten on EVERY call, even when nothing changed.
+ * Stale waivers and a stale sign-off are not removed from the saved state; they are only
+ * reported at the front of the review queue.
+ */
+export function runJourney(projectRoot: string, unit: string): JourneyReport {
+  const slug = reportSlug(unit);
+  const report = computeFresh(projectRoot, unit);
+  const sp = statePath(projectRoot, slug);
+  const state = loadState(sp) || { unit, auditHash: '', waivers: {} };
+  const curHash = auditHashOf(projectRoot, slug);
+  for (const ob of report.obligations) {
+    const w = state.waivers[ob.id];
+    if (!w) continue;
+    if (w.auditHashAtWaive === curHash) {
+      ob.status = 'waived';
+      ob.waivedReason = w.reason;
+      ob.detail = `waived — ${w.reason}`;
+    } else {
+      report.needsYou.unshift({ text: `⚠️ Waiver on "${ob.title}" is STALE (evidence changed since ${w.at}) — re-decide. Was: ${w.reason}`, anchor: { vpId: ob.id } });
+    }
+  }
+  // S5 — sign-off reconcile: a recorded sign-off satisfies OB-signoff while its evidence holds;
+  // if the audit changed since, the sign-off is STALE → re-surface for re-review (anti-amnesia).
+  const signoffOb = report.obligations.find((o) => o.id === 'OB-signoff');
+  if (signoffOb && state.signoff) {
+    if (state.signoff.auditHashAtSignoff === curHash) {
+      signoffOb.status = 'satisfied';
+      signoffOb.detail = `signed off${state.signoff.by ? ' by ' + state.signoff.by : ''} at ${state.signoff.at}`;
+    } else {
+      report.needsYou.unshift({ text: `⚠️ Sign-off is STALE (suite changed since ${state.signoff.at}) — re-review and sign off again.`, anchor: { vpId: 'OB-signoff' } });
+    }
+  }
+  report.nextSuggested = computeNext(report, unit); // must run after the status overrides above
+  saveState(sp, { unit, auditHash: curHash, waivers: state.waivers, signoff: state.signoff });
+  return report;
+}
+/**
+ * Waive an obligation — REQUIRES a reason (anti-amnesia: a waiver leaves a recorded "why").
+ * Records the current evidence cursor so reconcile can invalidate it if the audit changes.
+ *
+ * An existing waiver for the same obligation id is overwritten. The reason is stored trimmed.
+ *
+ * @returns the reconciled report from `runJourney` (so the new waiver is already reflected)
+ * @throws Error when `reason` is empty/blank, or when `obId` is not one of the obligations
+ *         computed for this unit right now
+ */
+export function waive(projectRoot: string, unit: string, obId: string, reason: string): JourneyReport {
+  if (!reason || !reason.trim()) {
+    throw new Error('A reason is required to waive (anti-amnesia: a waiver must record WHY). Use --reason "...".');
+  }
+  const slug = reportSlug(unit);
+  const fresh = computeFresh(projectRoot, unit);
+  const valid = fresh.obligations.map((o) => o.id);
+  if (!valid.includes(obId)) {
+    throw new Error(`Unknown obligation "${obId}". Valid: ${valid.join(', ')}`);
+  }
+  const sp = statePath(projectRoot, slug);
+  const state = loadState(sp) || { unit, auditHash: '', waivers: {} };
+  state.waivers[obId] = { reason: reason.trim(), at: new Date().toISOString(), auditHashAtWaive: auditHashOf(projectRoot, slug) };
+  saveState(sp, state);
+  return runJourney(projectRoot, unit);
+}
+/**
+ * Sign off — the single human gate (S5). Allowed ONLY when every other deliver-required
+ * obligation is satisfied or waived (you cannot sign off an unclean suite). Records who/when +
+ * the evidence cursor; reconcile invalidates it (stale → re-review) if the suite later changes.
+ *
+ * @param by optional name of the person signing off; omitted from the detail text when absent
+ * @returns the reconciled report from `runJourney` after the sign-off is stored
+ * @throws Error listing the ids of the obligations that are still open
+ */
+export function signoff(projectRoot: string, unit: string, by?: string): JourneyReport {
+  const before = runJourney(projectRoot, unit);
+  const blockers = before.obligations.filter((o) => o.id !== 'OB-signoff' && o.status !== 'satisfied' && o.status !== 'waived');
+  if (blockers.length) {
+    throw new Error(`Cannot sign off — still open: ${blockers.map((b) => b.id).join(', ')}. Fix, or waive each with a reason, first.`);
+  }
+  const slug = reportSlug(unit);
+  const sp = statePath(projectRoot, slug);
+  const state = loadState(sp) || { unit, auditHash: '', waivers: {} };
+  state.signoff = { by, at: new Date().toISOString(), auditHashAtSignoff: auditHashOf(projectRoot, slug) };
+  saveState(sp, state);
+  return runJourney(projectRoot, unit);
+}
+const ICON: Record<ObStatus, string> = { satisfied: '✅', 'needs-work': '⚠️ ', pending: '⏳', waived: '🚫' };
+export function renderJourneyBoard(r: JourneyReport): string { // Renders the report as a Markdown board; pure (no I/O).
+  const L: string[] = [];
+  L.push(`# Journey — ${r.unit}`);
+  L.push('');
+  L.push(`📍 **You are here:** ${r.youAreHere}` + (r.phasesDone.length ? `  (phases: ${r.phasesDone.join(' → ')})` : ''));
+  if (r.gateStatus) L.push(`   gate: **${r.gateStatus}** · score: ${r.score ?? '—'}/10`); // gate line only when the audit supplied a status
+  L.push('');
+  L.push('## What must still be true');
+  for (const o of r.obligations) L.push(`- ${ICON[o.status]} **${o.title}** — ${o.detail}`);
+  L.push('');
+  L.push(`## 🔎 Review queue — needs your judgment (${r.needsYou.length})`); // heading shows the FULL count
+  if (r.needsYou.length) {
+    for (const it of r.needsYou.slice(0, 20)) { // only the first 20 items are listed
+      const a = it.anchor;
+      const tag = [a.vpId, a.file && `${a.file}${a.line ? ':' + a.line : ''}`].filter(Boolean).join(' · '); // anchor parts that are present, joined
+      L.push(`- ${it.text}${tag ? `  〔${tag}〕` : ''}`);
+    }
+  } else L.push('- (nothing queued)');
+  L.push('');
+  L.push(`## → Next`);
+  L.push(`${r.nextSuggested}`);
+  L.push('');
+  return L.join('\n');
+}
+// ---------------- S3: inter-phase gate — obligations as HALT predicates (#398) ----------------
+//
+// A phase boundary is a deterministic gate: before the next phase may run, this phase's required
+// obligations must each be `satisfied` OR `waived` (S2 — an explicit, reasoned human acceptance).
+// A required obligation still `needs-work`/`pending` is a BLOCKER → HALT (no silent bad output
+// crosses the boundary, §9). Reuses runJourney → obligations already reflect waivers/reconcile.
+export type GatePhase = 'create' | 'run' | 'deliver';
+export interface GateVerdict {
+  unit: string;
+  phase: GatePhase;
+  status: 'pass' | 'halt'; // 'halt' as soon as there is at least one blocker
+  required: string[]; // obligation ids required at this phase
+  blockers: { id: string; title: string; detail: string }[]; // required obligations that are neither satisfied nor waived
+  waivedCredit: { id: string; title: string }[]; // required obligations accepted via an explicit waiver
+}
+const PHASE_REQUIRED: Record<GatePhase, string[]> = {
+  // post-create (design quality): spec + coverage + depth + traceability must hold.
+  create: ['OB-spec', 'OB-coverage', 'OB-depth', 'OB-trace'],
+  // post-run: the design gates + automation coverage.
+  run: ['OB-spec', 'OB-coverage', 'OB-depth', 'OB-trace', 'OB-automation'],
+  // pre-delivery: everything automated PLUS the single human sign-off (S5).
+  deliver: ['OB-spec', 'OB-coverage', 'OB-depth', 'OB-trace', 'OB-automation', 'OB-signoff'],
+};
+/**
+ * Evaluate the inter-phase gate for `phase`: every obligation required at that boundary must be
+ * `satisfied` or `waived`, otherwise the verdict is `halt`.
+ *
+ * A required obligation that is ABSENT from the journey report also blocks. Without an audit
+ * report the journey only carries `OB-create`, so none of the required ids exist; treating
+ * "not evaluated" as "nothing to block" would let an unmeasured suite pass the gate.
+ *
+ * Goes through `runJourney`, so waivers/reconcile are already applied (and the state file is
+ * rewritten as a side effect of that call).
+ *
+ * @returns the verdict; `blockers` and `waivedCredit` follow the order of the required list
+ */
+export function runGate(projectRoot: string, unit: string, phase: GatePhase): GateVerdict {
+  const r = runJourney(projectRoot, unit);
+  const required = PHASE_REQUIRED[phase];
+  const blockers: GateVerdict['blockers'] = [];
+  const waivedCredit: GateVerdict['waivedCredit'] = [];
+  for (const id of required) {
+    const o = r.obligations.find((x) => x.id === id);
+    if (!o) {
+      // Never evaluated (e.g. no audit report yet) — it has not been shown to hold, so it blocks.
+      blockers.push({ id, title: id, detail: 'not evaluated — this obligation is missing from the journey report (no audit report yet?)' });
+    } else if (o.status === 'waived') {
+      waivedCredit.push({ id: o.id, title: o.title });
+    } else if (o.status !== 'satisfied') {
+      blockers.push({ id: o.id, title: o.title, detail: o.detail });
+    }
+  }
+  return { unit, phase, status: blockers.length ? 'halt' : 'pass', required, blockers, waivedCredit };
+}
+export function renderGate(v: GateVerdict): string { // Renders the verdict as a plain-text banner; pure (no I/O).
+  const L: string[] = [];
+  L.push('');
+  L.push(`━━━ Gate: ${v.unit} @ phase "${v.phase}" → ${v.status === 'pass' ? '✅ PASS' : '⛔ HALT'} ━━━`);
+  if (v.blockers.length) {
+    L.push('  Blocking obligations (must be satisfied or explicitly waived):');
+    for (const b of v.blockers) L.push(`   • ${b.id} ${b.title} — ${b.detail}`);
+    L.push('  → Self-correct (repair / run-test), or `sungen journey --screen ' + v.unit + ' --waive <OB> --reason "..."` if accepted.'); // hint for either resolution path
+  } else {
+    L.push('  All required obligations satisfied' + (v.waivedCredit.length ? ` (${v.waivedCredit.length} accepted via waiver)` : '') + '.'); // mentions how many passed only via a waiver
+  }
+  L.push('');
+  return L.join('\n');
+}

package/src/harness/parse.ts CHANGED Viewed

@@ -106,7 +106,10 @@ function classifyScenario(sc: ParsedScenario): ScenarioInfo {
   const deferredToFlow = tags.includes('@deferred:flow');
   const ownedByFlow = (tags.find((t: string) => /^@owned-by:/i.test(t)) || '').slice('@owned-by:'.length) || undefined;
   // @deferred:flow is owned by a flow → not automated on this screen, so it accounts like @manual (H6).
-  const manual = tags.includes('@manual') || deferredToFlow;
+  // Recognize both bare `@manual` and the reason-coded `@manual:Mx` convention (what the generator emits);
+  // must match capability-plan.ts's detection, or `@manual:Mx` scenarios stay in the businessDepth
+  // denominator and silently suppress the ratio (#386).
+  const manual = tags.some((t) => /^@manual\b/i.test(t)) || deferredToFlow;
   const casesTag = tags.find((t) => t.startsWith('@cases:'));
   const casesDataset = casesTag ? casesTag.slice('@cases:'.length).trim() : undefined;
   // Named-query references: @query:<name>[(overrides)] tags + inline `query [name]` step refs.

package/src/orchestrator/ai-rules-updater.ts CHANGED Viewed

@@ -74,6 +74,7 @@ export const AI_RULES_FILE_MAPPING: [string, string][] = [
   ['claude-agent-reviewer.md', '.claude/agents/sungen-reviewer.md'],
   ['claude-agent-discovery.md', '.claude/agents/sungen-discovery.md'],
   ['claude-agent-challenge.md', '.claude/agents/sungen-challenge.md'],
+  ['claude-agent-generator.md', '.claude/agents/sungen-generator.md'],
   // Skills — GitHub Copilot
   ['github-skill-sungen-gherkin-syntax.md', '.github/skills/sungen-gherkin-syntax/SKILL.md'],

package/src/orchestrator/templates/ai-instructions/claude-agent-generator.md ADDED Viewed

@@ -0,0 +1,44 @@
+---
+name: sungen-generator
+description: Generates Gherkin scenarios for ONE shard (a viewpoint theme or a spec section) in an isolated context and writes a self-contained fragment — so create-test can fan out many generators in parallel and the orchestrator stays lean. Each shard owns a disjoint VP-prefix namespace, so fragments merge without renumbering. Invoked by create-test/design during parallel generation.
+tools: Read, Grep, Glob, Bash, Write, Edit, Skill
+---
+You are a **single-shard test-case generator**. You run in an **isolated context** and produce the scenarios for **exactly one shard** — never the whole screen. The orchestrator runs several of you in parallel, then merges the fragments. Keeping each fragment small is also what keeps every generator under the output-token cap.
+## What a shard is
+A shard is one **coverage unit**, sized for real parallelism (not the 5 coarse viewpoint-router groups — a screen loads only 1–2 of those). It is **one of**:
+- a **viewpoint theme** — a `VP-` prefix from the viewpoint overview (e.g. `VP-SEC`, `VP-ERROR-EMPTY-STATE`, `VP-CAROUSEL`), or
+- a **spec section** — one `spec.md` section per the `sungen-tc-generation` Mapping Contract (Table 1).
+Your shard owns its `VP-` prefix, so your ids never collide with sibling shards.
+## Inputs (passed by the orchestrator)
+- **Your shard**: the theme/section name + its viewpoint items (the slice).
+- **The `sungen-discovery` report** (Step 3): condensed facts — use it instead of re-reading every source.
+- **Relevant context**: only the `spec.md` section(s) your shard maps to, and **which** `sungen-viewpoint` group file holds your shard's patterns (load only that one).
+- **Unit context**: screen vs flow, the unit name, the chosen tier (1 / 2 / 3 / full), and your fragment paths.
+## Generate (your shard ONLY)
+1. Load **only** the skills you need: `sungen-tc-generation` (output format + mapping), `sungen-gherkin-syntax` (step patterns), and the **one** `sungen-viewpoint` group file your shard belongs to. Do not load the others.
+2. Produce the scenarios for your shard's viewpoint items at the requested tier, following the skill's mapping contract. Keep every `VP-` id under **your shard's prefix** so it stays in a disjoint namespace.
+3. **Flows**: use `[Screen:Element]` namespace refs, namespace test-data by phase, add the `@flow` tag per the skill.
+4. Tag `@manual:Mx` (with a reason) only for true judgment / missing-capability items, per the skill.
+## Write your fragment (do NOT write the final feature)
+Write two self-contained fragment files (the orchestrator merges them):
+- `.sungen/fragments/<unit>/<shard>.feature` — a **headerless** block: just your `@tag`-decorated `Scenario:` / `Scenario Outline:` blocks, no `Feature:` line (the orchestrator owns the single Feature header).
+- `.sungen/fragments/<unit>/<shard>.test-data.yaml` — only the `{{variables}}` your scenarios introduce.
+Distinct paths per shard ⇒ no write conflict with sibling generators.
+## Return (compact — your only message back)
+```
+SHARD: <theme-or-section>
+SCENARIOS: <n>  (VP ids: <VP-...-001..NNN>)
+TEST-DATA KEYS: <keys you added>
+SPEC SECTIONS COVERED: <list>
+ASSUMPTIONS / DEFERRED: <items you marked @manual or could not source>
+FRAGMENT: .sungen/fragments/<unit>/<shard>.feature
+```
+Keep it tight. Do not audit, do not merge, do not touch other shards' fragments or the final `.feature`.

package/src/orchestrator/templates/ai-instructions/claude-cmd-create-test.md CHANGED Viewed

@@ -71,18 +71,41 @@ If the unit is **api-first** (`qa/api/<name>/` or `qa/api/flows/<name>/`), the d
    Summarize what you found in requirements and present to the user.
 4. Follow the `sungen-tc-generation` skill for section identification, viewpoint generation, and output format. **Viewpoint loading discipline:** `sungen-viewpoint` is a **router** — from the page-type (form / list / detail / auth / dashboard …) read **only the matching group file(s)** (e.g. a login screen → group-e-identity; a product list → group-c-data-explore), never all five groups. This keeps the generation context lean. **For flows**, use the "Flow Test Generation" section in the skill. When requirements exist, use the "Requirements-Driven Generation" strategy. **For Tier 1**, apply the **Lightweight Guard** — verify required fields, validation rules, business rules, security checks, and key state transitions all have TCs after generation. **For Tier 2+**, **MUST** apply the full **Mapping Contract** — walk every `spec.md` section top-to-bottom and produce the indicated TCs per Table 1; handle `test-viewpoint.md` per Table 2. Do not silently skip sections.
-5. Generate or update `.feature` + `test-data.yaml` following `sungen-gherkin-syntax` and `sungen-tc-generation` skills. **For flows**: use `[Screen:Element]` namespace format, namespace test-data by phase, add `@flow` tag.
+5. Generate `.feature` + `test-data.yaml` following `sungen-gherkin-syntax` and `sungen-tc-generation`. **Partition the work into shards and generate them in parallel** when there are ≥2.
+   **5a. Decide the shards.** A shard is one **coverage unit** sized for parallelism — NOT the 5 coarse viewpoint-router groups (a screen loads only 1–2 of those). Use **either**:
+   - one **viewpoint theme** per shard — a `VP-` prefix from the viewpoint overview (`VP-SEC`, `VP-ERROR-EMPTY-STATE`, `VP-CAROUSEL`, …) — preferred when the viewpoint overview is rich (test-2/home had 47 items across many themes); **or**
+   - one **`spec.md` section** per shard (the Mapping Contract walk, Table 1) — preferred when generating from spec.
+   Each shard owns a disjoint `VP-` prefix ⇒ ids never collide. One shard → skip to 5c (no fan-out gain).
+   **5b. Parallel fan-out (Claude Code).** Spawn one **`sungen-generator`** sub-agent **per shard** (Task tool, `subagent_type: sungen-generator`) — issue all the Task calls **in a single message** so they run concurrently. Pass each: its shard (theme/section) + viewpoint slice, the **`sungen-discovery` report** (Step 3), only the `spec.md` section(s) it maps to, which one `sungen-viewpoint` group file holds its patterns, the unit (screen/flow) + name + tier, and its fragment paths `.sungen/fragments/<name>/<shard>.{feature,test-data.yaml}`. Each writes a **headerless** fragment + a test-data fragment and returns a compact summary. Small fragments also keep every generator under the output-token cap (the reason the single-pass path writes incrementally).
+   **5c. Merge (orchestrator — barrier; only after all generators return).**
+   - Write the final `qa/<screens|flows>/<name>/features/<name>.feature`: one `Feature:` header (+ `@flow` for flows), then concatenate the fragments in **stable order** — spec-section order top-to-bottom (or theme order from the viewpoint overview) — so output is coherent and reproducible across runs.
+   - **Dedup** cross-shard scenarios with near-identical titles (a generic "navigation works" from two shards): keep the earlier shard's, drop the duplicate, note it. No id renumber needed — prefixes are disjoint by construction.
+   - **Union** the test-data fragments into `test-data.yaml`; dedup keys, and **flag** any key two shards define with different values.
+   - Delete `.sungen/fragments/<name>/` once merged.
+   - Guarantees a **coherent** suite (no dup, valid ids, passes `audit`), not a byte-identical one — generation is AI-authored; the determinism asset lives downstream in the Gherkin→`.spec.ts` compiler.
+   **5d. Sequential fallback.** Use the single-context incremental path (Step 2: tier-by-tier `Write`/`Edit` batches) when: only **one** shard applies, **Copilot / no sub-agents**, or a constrained setup. Same output, just no speedup. **For flows**: `[Screen:Element]` namespace refs, test-data namespaced by phase, `@flow` tag.
+5.4. **Depth self-check (deterministic — run BEFORE the audit).** Run `sungen depth-lint --screen <name>` (Bash). It reuses the audit's businessDepth classifier and splits every shallow business-critical scenario into two actionable buckets — act on them now so the audit/repair loop doesn't burn rounds on depth:
+   - **DEEPEN IN PLACE** — add a real value assertion to each (`User see all [X] contain {{v}}`, `User remember [X] as {{v}}` + `… with {{v}}`). The printed `template` is a **hint** keyed off the theme — apply judgment to the scenario's actual claim; do NOT paste a value assertion that doesn't fit (e.g. a carousel-visibility scenario should assert the product SET, not a price). If a flagged scenario is genuinely visibility/behavior (not data-correctness), that's an over-count — leave it and note it, never fake an assertion.
+   - **CROSS-SCREEN** — route to a flow (`/sungen:add-flow`) or tag `@manual:Mx` + reason; do NOT fake an on-screen data assertion. This removes it from the screen's depth denominator honestly.
+   Re-run `sungen depth-lint` until `deepen` is empty (or only honest over-counts remain), THEN proceed to the gate. This lifts first-pass `businessDepth` mechanically instead of via 2–3 organic repair rounds.
 5.5. **Quality gate & repair (harness — always run, do NOT skip).** Follow the `sungen-harness-audit` skill:
    - Run `sungen audit --screen <name>` (Bash) and read `gateStatus` + `findings` (deterministic, structural).
    - **Independent semantic review.** **Claude Code:** spawn the **`sungen-reviewer`** sub-agent (Task tool, `subagent_type: sungen-reviewer`) — it judges what the gate can't (does each scenario's steps PROVE its title/viewpoint, observable Thens, business-critical assertion depth) and returns `VERDICT` + `ISSUES` with concrete fixes. **Merge its NEEDS-REPAIR issues with the audit findings.** (Copilot / no sub-agents: run the same review inline using the `sungen-reviewer` criteria.)
    - Repair **both** the audit findings and the reviewer issues (budget 3 rounds), then re-audit:
+   - **Repair runs single-agent by default** (it edits the one `.feature` — concurrent edits to the same file conflict, and BALANCE/dedup needs whole-suite context). **Exception:** a finding that is purely **additive new coverage** (GATE missing-theme → generate scenarios for an uncovered theme) is just more shards — fan it out as `sungen-generator` sub-agent(s) (new disjoint `VP-` prefix) and merge, exactly like Step 5b. Findings that **edit existing** scenarios (DEPTH/BALANCE/TRACE) stay serial.
    - If the gate FAILs or there are findings, **repair** (budget 3 rounds), then re-audit:
      - **GATE** missing critical theme → generate scenarios for it. If it is **cross-screen** (cart-correctness, product-detail-consistency, filter-result-correctness): **automate it in the flow** (`/sungen:add-flow` if none exists) with observable data assertions (`... with {{value}}`, `see all ... contain {{v}}`) — a single home→target journey runs as one Playwright test. Do **not** write a full `@manual` duplicate of it on the screen (that is a non-running dead copy — `sungen audit` flags it `MANUAL-AUTOMATABLE`), and do **not** fake a shallow single-screen pass. Reserve `@manual` for true judgment / missing-capability, tagged `@manual:Mx`.
      - **DEPTH** → replace `see [X] page/section` on business-critical scenarios with data assertions.
      - **BALANCE** → stop expanding secondary viewpoints; add business-core scenarios first.
      - **TRACE** → align `VP-` ids with the viewpoint-overview.
    - Stop when the gate PASSes and findings clear, **or** the budget is exhausted → report residual gaps honestly (never fake a pass).
+   - **Phase gate (boundary — do NOT skip).** Run `sungen gate --screen <name> --phase create` (Bash, exit 2 = HALT). It is the deterministic create-boundary: every required obligation (spec · coverage · depth · trace) must be **satisfied or explicitly waived**. On **HALT**, you have not cleared the phase — keep repairing the blocking obligation(s) within budget; if a blocker is a genuinely-accepted gap (e.g. cross-screen depth owned by a flow), record it with `sungen journey --screen <name> --waive <OB> --reason "..."` (reason mandatory). **Do not converge (step 6) past a HALT** without a fix or a reasoned waiver — no bad output crosses the boundary.
 5.6. **Record (reuse + observability).** Build the manifest and report usage:
    - `sungen manifest --screen <name>` — fingerprints for next-run change detection. On a **re-run**, start the whole command by `sungen manifest --screen <name> --diff` and only regenerate scenarios whose spec section changed (keep/regenerate/retire).

package/src/orchestrator/templates/ai-instructions/claude-cmd-run-test.md CHANGED Viewed

@@ -9,6 +9,8 @@ allowed-tools: Read, Grep, Bash, Glob, Edit, Write, AskUserQuestion, mcp__playwr
 You are a **Senior Developer**. Use `sungen-selector-fix`, `sungen-selector-keys`, and `sungen-error-mapping` skills.
+> ⛔ **Source of truth — the live page is NOT the oracle; `.feature`/`test-data`/`spec.md` are.** Auto-fix is for **selector-resolution** failures (wrong locator → fix `selectors.yaml`). An **assertion-value** failure where the app contradicts the spec is a **CANDIDATE BUG → report it, let it FAIL** — never loosen the rule, weaken the assertion, edit the expected value/`.feature`, or hand-edit the generated `.spec.ts` to make it pass. See `sungen-error-mapping` § "Source of truth". (A `password > 8` test that fails on 6 chars is a bug to report, not a `>= 6` edit.)
 ## Parameters
 Parse from `$ARGUMENTS`:
@@ -102,6 +104,7 @@ If the unit is **api-first**, skip every selector/capture phase (an API test has
 9. **Integrity check & trace (always run after the final run).**
    - `sungen script-check --screen <name>` — verify the generated spec is a **1:1** of the Gherkin (every non-@manual scenario ↔ one `test()`, no drift). If it reports **DRIFT** (spec hand-edited or stale), re-run `sungen generate --screen <name>` so the spec matches the feature, then re-run — **never hand-edit the generated spec** (auto-fix must edit `selectors.yaml`, not the `.spec.ts`).
    - `sungen ledger record --screen <name> --step run --ms <elapsed>` (record this run), then `sungen trace --screen <name>` — show the process map + bottlenecks + **HUMAN-LOOP FOCUS** (the @manual scenarios the QA must verify) to the user.
+   - **Phase gate (boundary — do NOT skip).** `sungen gate --screen <name> --phase run` (exit 2 = HALT): the run-boundary obligations (incl. automation) must be **satisfied or explicitly waived**. On **HALT**, classify per `sungen-error-mapping` § Source of truth (#387): a **selector-resolution** failure → fix `selectors.yaml` + re-run; an **assertion-vs-spec** failure → **report it as a candidate bug / leave it FAIL** (never weaken the assertion or edit the expected value to pass); a genuinely-accepted gap → `sungen journey --screen <name> --waive <OB> --reason "..."`. Do **not** declare the run "done" past a HALT without a fix, a reported bug, or a reasoned waiver.
 10. **Capability-pending offer (consent-gated).** If `sungen audit --screen <name>` reports `AUTOMATION-READY-PENDING` (or the run shows `@requires:<cap>` tests skipped "requires …"), these are **automation-ready** scenarios waiting on an opt-in driver. Use `AskUserQuestion` to offer: *"N scenario(s) are automation-ready — enable `<cap>` to run them? (`sungen capability add <cap>`)"*. **Only on the user's yes** run `sungen capability add <cap>` then re-run those specs; on no, leave them skipped (they are NOT failures and NOT manual). **Never auto-install.**
 ## Playwright command guidelines

package/src/orchestrator/templates/ai-instructions/claude-skill-delivery.md CHANGED Viewed

@@ -88,6 +88,33 @@ Multi-locale (no `SUNGEN_ENV`): one **`<LOCALE> Auto`** sheet per locale + a sin
 ---
+## API delivery — extra worksheet
+For **api-kind units** (`qa/api/<area>/`), the `.xlsx` gains a third worksheet **`API detail`** (appended after Auto/Manual). The main BM-2-901-13 Testcases layout is unchanged. The CSV is unchanged (16-column, no extra sheet).
+### Required sources (API detail sheet only)
+| Source | Path | Created by |
+|--------|------|------------|
+| Endpoint catalog | `qa/api/<area>/api/apis.yaml` | `sungen add --api` or `sungen api import` |
+| Scenario annotations | `qa/api/<area>/features/<feature>.feature` | `create-test` |
+### API detail column mapping
+| Column | Source |
+|--------|--------|
+| Endpoint | `path` from `apis.yaml` catalog entry |
+| Method | `method` from catalog entry (uppercased) |
+| Auth / Datasource | catalog `datasource` + any `@auth:<role>` tag from scenarios calling this endpoint |
+| Request shape | catalog `body` + `params` fields composed as `body: {…}; params: [a, b]` |
+| Expected-status matrix | `@cases:<dataset>` label for data-driven scenarios; catalog `expect.status` as fallback |
+| Flow steps | Ordered `@api:<name>` call chain from multi-call scenarios (e.g. `register → count_users`) |
+| Concurrency invariant | `@concurrent:<N>` + `@query:<oracle>` from concurrent scenarios (e.g. `ok_count=2; @query user_count`) |
+**Sources are catalog + annotations only** — Field Metadata (FM) is not required for this sheet.
+---
 ## Excluded from CSV
 - `@steps:<name>` **base** scenarios — these are setup-only, inlined into `@extend:...` scenarios at compile time

package/src/orchestrator/templates/ai-instructions/claude-skill-error-mapping.md CHANGED Viewed

@@ -21,6 +21,23 @@ Then choose the fix from the patterns below.
 ---
+## ⛔ Source of truth — classify EVERY failure before you "fix" it
+`.feature` + `test-data.yaml` + `spec.md` are the **oracle**. The **live page is NOT** — it may be the thing that's broken. A failing test is not automatically a test to "make pass". Classify first:
+- **Selector-resolution failure** (element not found / wrong locator / strict-mode / wrong element type) → the test looked in the wrong place. **Fix the locator in `selectors.yaml`** (re-snapshot, copy the exact accessible name). Legit auto-fix.
+- **Assertion-value failure** (element FOUND, but observed value ≠ expected) → STOP and ask: *is the TEST wrong, or is the APP wrong?*
+  - Expected value/rule is wrong **relative to `spec.md`** (typo, stale test-data) → fix `test-data.yaml`/`.feature` so it matches the **spec** — never the live page.
+  - App behaviour contradicts `spec.md` (spec says X, app shows Y) → **CANDIDATE BUG**. **Report it** (let the test FAIL / surface to the QA in the run summary). **NEVER** change the expected value, loosen the rule, weaken the assertion (`toHaveText`→`toContainText` to dodge a mismatch), edit `.feature`, or edit the generated `.spec.ts` to make it pass.
+> **Cardinal sin (do NOT do this):** a test for the `password > 8 chars` rule fails on a 6-char input → "fix" the rule to `>= 6` so the test passes. The logic is now meaningless. A failing assertion is a **finding**, not a chore.
+**Auto-fix loop scope:** the run-test auto-fix loop engages ONLY on **selector-resolution** failures. On an assertion-value failure where the app contradicts the spec → **HALT and report**, do not loop it into passing.
+**Never hand-edit the generated `.spec.ts`** (e.g. inserting `page.evaluate`/`fetch` to bypass a broken control). `sungen script-check` regenerates the spec from `.feature` and flags any edit as DRIFT — regenerate, don't patch.
+---
 ## Fix Priority (try in order)
 1. **Auth issue** — page redirected to login? Fix auth first, everything else is noise
@@ -43,11 +60,13 @@ Then choose the fix from the patterns below.
 | not a select | Custom dropdown, not native `<select>` | Set `variant: 'custom'` |
 | Frame not found | iframe selector wrong or doesn't exist | Fix `frame` value, verify iframe in snapshot |
-### Assertion errors → fix in `test-data.yaml` or `.feature`
+### Assertion errors → apply the Source-of-truth gate above FIRST
-| Error | Diagnosis | Fix |
+> The "Fix" column below applies **only when the expected value was wrong relative to `spec.md`** (a test defect). If the app's value contradicts the spec, the row is a **candidate bug → report it, do not edit the expected value to match the live page**. Never weaken `toHaveText`→`toContainText` just to pass.
+| Error | Diagnosis | Fix (only if the TEST was wrong per spec) |
 |---|---|---|
-| toHaveText mismatch | Expected text differs from actual | Fix value in test-data. If element is input type → change Gherkin type to `field`/`textarea` (triggers `toHaveValue` instead) |
+| toHaveText mismatch | Expected text differs from actual | If the test's expected value was wrong per spec → fix value in test-data. If element is input type → change Gherkin type to `field`/`textarea` (triggers `toHaveValue`). If the app's value contradicts the spec → **report as bug**. |
 | toHaveValue mismatch | Expected value differs from actual | Fix value in test-data |
 | toContainText mismatch | Partial text not found | Fix expected partial text in test-data |
 | toBeVisible timeout | Element exists but hidden, or name wrong | Check: is element conditionally visible? Wrong name? Inside dialog? |

package/src/orchestrator/templates/ai-instructions/claude-skill-gherkin-syntax.md CHANGED Viewed

@@ -214,6 +214,8 @@ Options: `nth` `exact` `scope` `match` `variant` `frame` `contenteditable` `colu
 | `@cases:dataset` | Data-driven: run the scenario once per row of the `dataset` LIST in test-data → one `test()` per row |
 | `@query:name` | Database: run the named query from `database/queries.yaml` (precondition) and bind its rows to `{{name}}`; assert with `expect {{name.count}} …` + path access. Override params `@query:name(p={{v}})`. Repeatable. (Optional Data Driver — see Database verification above) |
 | `@api:name` | API: run the named request from `api/apis.yaml` (precondition) and bind the response to `{{name}}`; assert with `expect {{name.status}} …` + path access (`{{name.body.<path>}}`). Override params `@api:name(p={{v}})`. Repeatable. (Optional API Driver) |
+| `@concurrent:N` | API idempotency: fire the bound `@api` request N times in parallel, then bind aggregates on the `@api` name — `{{name.ok_count}}` (2xx count) and `{{name.status_counts}}` (status→count map). Assert the exactly-once invariant (`expect {{name.ok_count}} is 1`); pair with `@query` as a DB oracle. Tag order = run order: `@api` (mutate) before `@query` (verify). (Optional API Driver) |
+| `@hybrid` | One unit, two capabilities: a signed-in browser session (UI) authorizes the `@api` call — the API request reuses the UI `storageState`. (Optional API + UI Drivers) |
 ### Data-driven scenarios (`@cases`)

package/src/orchestrator/templates/ai-instructions/claude-skill-tc-generation.md CHANGED Viewed

@@ -9,6 +9,8 @@ user-invocable: false
 - **Write incrementally — never emit the whole suite in one response.** Build the `.feature` in batches via successive `Write`/`Edit` (≈10–15 scenarios per call). For **Full coverage**, write tier-by-tier: `Write` Tier 1 → `Edit` append Tier 2 → `Edit` append Tier 3.
   → One huge `Write` can exceed the model's output-token cap → `API Error: Claude's response exceeded the N output token maximum`. Single-pass full coverage only fits when `CLAUDE_CODE_MAX_OUTPUT_TOKENS ≥ 64000`; otherwise batch. Batching also lets the audit/reviewer run per batch — higher quality.
+- **Sharded (parallel) generation — keep each shard self-contained.** When `create-test` fans out one `sungen-generator` sub-agent per shard (a viewpoint theme like `VP-SEC`, or a `spec.md` section — see create-test Steps 5a–5c), you are generating **only your shard**: emit your scenarios under **your own `VP-` prefix** (disjoint namespace, so ids never collide), as a **headerless fragment** (no `Feature:` line — the orchestrator owns the single header). Do not reference or renumber other shards. The orchestrator concatenates fragments in stable order (spec-section / theme order), deduplicates by title, and unions test-data. Small fragments also stay under the output-token cap by construction.
 - `spec_figma.md` exists → read file only, **NEVER** call `mcp__figma__*`
   → PAT auth flow already done by `sungen-capture` (mode figma-pat); re-calling fails or duplicates work.
@@ -273,6 +275,7 @@ Security:         [S1 – admin only]
 **Depth is a GATE dimension (harness-roadmap P1) — self-raise, never silently go shallow:**
 - For every data-correctness theme the catalog marks `depth.requires: data-assertion`, emit its `depth.template` shape by **default** — don't wait for the repair loop. `sungen audit` measures `businessDepth` (ratio of these scenarios that assert data) against an intent threshold (functional ≥ 0.70); below it the **gate FAILs**.
+- **Verify depth deterministically before the gate:** run `sungen depth-lint --screen <name>`. It classifies every shallow business-critical scenario into **deepen-in-place** (add the theme's value assertion — the printed `template` is a hint, fit it to the actual claim) vs **cross-screen** (route to a flow / `@manual:Mx`). Clear the `deepen` list first — this is the mechanical way to hit `businessDepth` on the first pass instead of churning repair rounds. Never fake a value assertion onto a visibility/behavior scenario the lint over-counts; leave it and note the over-count.
 - `depth.cross_screen: true` (cart / detail / filter / brand correctness) → write the deep capture/compare shape as an **automated flow scenario** (in the flow — do NOT leave a full-step `@manual` duplicate on the screen). `@manual` is **only** for genuine judgment (M6 visual/UX · M8 not-worth · M9 human) or a missing capability (M1–M5/M7), and it **must** carry a reason code (`@manual:Mx`, or a reason comment the planner can infer). A `@manual` scenario that still has full automatable steps (a data assertion, no visual/mock/a11y judgment) is now flagged by `sungen audit` as `MANUAL-AUTOMATABLE`, and business-critical scenarios you defer to `@manual` are reported as `DEPTH-DEFERRED` (they do NOT silently inflate `businessDepth`). Deferring automatable work to `@manual` lowers quality — automate it in the flow instead.
 - **Pick the right `@manual:Mx` code — it decides which driver can later automate the case** (`sungen audit` flags a code↔reason mismatch). Tag the code that matches the **oracle the reason describes**:

package/src/orchestrator/templates/ai-instructions/copilot-cmd-create-test.md CHANGED Viewed

@@ -64,8 +64,11 @@ If the unit is **api-first** (`qa/api/<name>/` or `qa/api/flows/<name>/`), the d
    Summarize what you found in requirements and present to the user.
 4. Follow the `sungen-tc-generation` skill for section identification, viewpoint generation, and output format. **For flows**, use the "Flow Test Generation" section in the skill. When requirements exist, use the "Requirements-Driven Generation" strategy. **For Tier 1**, apply the **Lightweight Guard** — verify required fields, validation rules, business rules, security checks, and key state transitions all have TCs after generation. **For Tier 2+**, **MUST** apply the full **Mapping Contract** — walk every `spec.md` section top-to-bottom and produce the indicated TCs per Table 1; handle `test-viewpoint.md` per Table 2. Do not silently skip sections. Present sections as a numbered list and let user pick.
-5. Generate or update `.feature` + `test-data.yaml` following `sungen-gherkin-syntax` and `sungen-tc-generation` skills. **For flows**: use `[Screen:Element]` namespace format, namespace test-data by phase, add `@flow` tag.
+5. Generate or update `.feature` + `test-data.yaml` following `sungen-gherkin-syntax` and `sungen-tc-generation` skills. Generate **group-by-group** (one viewpoint group at a time, tier-by-tier `Write`/`Edit` batches) to stay under the output-token cap. **For flows**: use `[Screen:Element]` namespace format, namespace test-data by phase, add `@flow` tag.
+   > **No parallel fan-out here.** Copilot has no sub-agents, so generation is sequential (the Claude Code variant fans out one `sungen-generator` per viewpoint group and merges). Same output, no speedup.
+5.4. **Depth self-check (deterministic — BEFORE the audit).** Run `sungen depth-lint --screen ${input:name}`. It splits every shallow business-critical scenario into **DEEPEN IN PLACE** (add a real value assertion — the printed `template` is a theme-keyed hint, apply judgment to the actual claim; never fake one onto a visibility/behavior scenario) and **CROSS-SCREEN** (route to a flow / tag `@manual:Mx` + reason — removes it from the depth denominator honestly). Act on both, re-run until `deepen` is empty (or only honest over-counts remain), THEN gate. Lifts first-pass `businessDepth` mechanically instead of via 2–3 repair rounds.
 5.5. **Quality gate & repair (harness — always run).** Per `sungen-harness-audit`: run `sungen audit --screen ${input:name}` (structural), THEN do an **independent semantic review inline** using the `sungen-reviewer` criteria (does each scenario's steps PROVE its title/viewpoint? observable Thens? business-critical assertion depth?). Merge both sets of issues; if gate FAILs / findings exist, repair (budget 3) and re-audit — GATE missing theme → generate it (cross-screen → **automate it in the flow** via `/sungen:add-flow`, NOT a full `@manual` screen duplicate — `sungen audit` flags an automatable `@manual` as `MANUAL-AUTOMATABLE`; reserve `@manual:Mx` for true judgment/missing-capability); DEPTH → add data assertions; BALANCE → add business-core first; TRACE → align VP ids. Never fake a pass.
+5.5b. **Phase gate (boundary — do NOT skip).** Run `sungen gate --screen ${input:name} --phase create` (exit 2 = HALT): every required obligation (spec · coverage · depth · trace) must be **satisfied or explicitly waived**. On **HALT**, keep repairing within budget; a genuinely-accepted gap → `sungen journey --screen ${input:name} --waive <OB> --reason "..."` (reason mandatory). Do **not** converge (step 6) past a HALT without a fix or a reasoned waiver.
 5.6. **Record.** `sungen manifest --screen ${input:name}`. Ledger **each phase** (not just repair) — pick one `runId` at the start and pass it so `trace`/`ledger report` show THIS run, not a mix: `sungen ledger record --screen ${input:name} --run <runId> --step <discovery|viewpoint|gherkin|audit|repair:N> --ms <elapsed>`. On re-run, start with `sungen manifest --screen ${input:name} --diff` and only regenerate changed sections.
 6. **Converge — show the trace.** Run `sungen trace --screen ${input:name}` and present: process map (phases + repair rounds), bottlenecks, **HUMAN-LOOP FOCUS** (@manual to verify), audit score + gate + residual gaps. Then offer next steps based on which tier was just generated:

package/src/orchestrator/templates/ai-instructions/copilot-cmd-run-test.md CHANGED Viewed

@@ -9,6 +9,8 @@ tools: [read, execute, edit, vscode/askQuestions, playwright/*]
 You are a **Senior Developer**. Use `sungen-selector-fix`, `sungen-selector-keys`, and `sungen-error-mapping` skills.
+> ⛔ **Source of truth — the live page is NOT the oracle; `.feature`/`test-data`/`spec.md` are.** Auto-fix is for **selector-resolution** failures (wrong locator → fix `selectors.yaml`). An **assertion-value** failure where the app contradicts the spec is a **CANDIDATE BUG → report it, let it FAIL** — never loosen the rule, weaken the assertion, edit the expected value/`.feature`, or hand-edit the generated `.spec.ts` to make it pass. See `sungen-error-mapping` § "Source of truth". (A `password > 8` test that fails on 6 chars is a bug to report, not a `>= 6` edit.)
 ## Parameters
 Parse from `$ARGUMENTS`:
@@ -93,6 +95,7 @@ If the unit is **api-first**, skip every selector/capture phase (an API test has
 7. **Phase 3 — Full Run**: Run all tests. Fix only **new** failures (elements unique to `@normal`/`@low`). Max 1 attempt. Don't loop on low-priority failures.
 8. **Phase 4 — Regression**: One final full run. Report results. No more fix loops.
 9. **Integrity & trace (always run after the final run).** `sungen script-check --screen <name>` — verify the spec is a **1:1** of the Gherkin; if **DRIFT**, re-run `sungen generate --screen <name>` (never hand-edit the `.spec.ts` — auto-fix edits `selectors.yaml`). Then `sungen ledger record --screen <name> --step run --ms <elapsed>` and `sungen trace --screen <name>` to show the process map + bottlenecks + **HUMAN-LOOP FOCUS**.
+9b. **Phase gate (boundary — do NOT skip).** `sungen gate --screen <name> --phase run` (exit 2 = HALT): run-boundary obligations (incl. automation) must be **satisfied or explicitly waived**. On HALT, classify per `sungen-error-mapping` § Source of truth (#387): selector-resolution failure → fix `selectors.yaml` + re-run; assertion-vs-spec failure → **report as a candidate bug / leave it FAIL** (never weaken the assertion to pass); accepted gap → `sungen journey --screen <name> --waive <OB> --reason "..."`. Don't declare "done" past a HALT without a fix, a reported bug, or a reasoned waiver.
 10. **Capability-pending offer (consent-gated).** If `sungen audit` reports `AUTOMATION-READY-PENDING` (or `@requires:<cap>` tests are skipped "requires …"), offer: *"N scenario(s) are automation-ready — enable `<cap>` to run them? (`sungen capability add <cap>`)"*. Only on the user's yes, run `sungen capability add <cap>` + re-run; on no, leave skipped (not failures, not manual). **Never auto-install.**
 ## Playwright command guidelines