npm - @sun-asterisk/sungen - Versions diffs - 3.1.2 → 3.2.0-beta.142 - Mend

@sun-asterisk/sungen 3.1.2 → 3.2.0-beta.142

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

package/src/harness/repair.ts ADDED Viewed

@@ -0,0 +1,75 @@
+/**
+ * Repair planner (#343) — the consumer of the `repair` capability SPI.
+ *
+ * Gathers the unit-capability's fix rules and matches them against the audit findings (always) and
+ * the latest Playwright failures (best-effort), turning them into a concrete fix plan. Deterministic:
+ * the AI repair loop and a human get the same proposals. Backs `sungen repair`.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+import { capabilityRegistry } from '../capabilities/registry';
+import { discoverAndRegisterCapabilities } from '../capabilities/discover';
+import { scoringCapabilityFor } from './audit';
+export interface RepairProposal { source: 'audit' | 'runtime'; signal: string; ruleId: string; fix: string } // one signal matched to a rule: where the signal came from, its text, and the matching rule's id + fix
+export interface RepairPlan { // the value returned by planRepair()
+  capability: string | undefined; // capability id resolved for the unit; undefined when none resolved
+  rulesAvailable: number;  // number of repair rules the resolved capability contributed
+  proposals: RepairProposal[]; // one entry per signal that matched a rule, in signal order
+  unmatched: string[];     // findings/failures with no matching rule (need a human)
+}
+/**
+ * Collect failure messages from a Playwright JSON result file (best-effort, defensive).
+ *
+ * Walks `suites` recursively (suite → specs → tests → results) and emits one entry per result
+ * whose status is `failed` or `timedOut`, formatted as `<spec title>: <message>`. The message is
+ * `error.message`, else the first `errors[]` message, else the status itself; only its first
+ * line is kept, truncated to 200 characters.
+ *
+ * @param file path of the JSON result file to read
+ * @returns the collected messages. Any read/parse/shape error is swallowed, so the result is
+ *          whatever had been collected before the error (empty for a missing or non-JSON file).
+ */
+function failuresFromResult(file: string): string[] {
+  const out: string[] = [];
+  try {
+    const r = JSON.parse(fs.readFileSync(file, 'utf8'));
+    const visit = (suite: any) => { // untyped on purpose: the report shape is not validated
+      for (const sp of suite.specs ?? []) {
+        for (const t of sp.tests ?? []) {
+          for (const res of t.results ?? []) {
+            if (res.status === 'failed' || res.status === 'timedOut') {
+              const msg = res.error?.message || res.errors?.[0]?.message || res.status; // falls back to the bare status when no message is present
+              out.push(`${sp.title}: ${String(msg).split('\n')[0].slice(0, 200)}`); // first line only, capped at 200 chars
+            }
+          }
+        }
+      }
+      for (const s of suite.suites ?? []) visit(s); // recurse into nested suites
+    };
+    for (const s of r.suites ?? []) visit(s);
+  } catch { /* missing/!json → no runtime signals */ }
+  return out;
+}
+/**
+ * Build the repair plan for a unit.
+ *
+ * Resolves the unit's scoring capability, takes that capability's `repair.rules`, and matches
+ * every collected signal against them; the first rule whose `match` tests true wins. Signals are
+ * the `findings` of `<cwd>/.sungen/reports/<reportName>-audit.json` plus, when `generatedDir` is
+ * given, the failures of every file directly inside it whose name contains `test-result` and ends
+ * in `.json`. Missing, unreadable or malformed inputs contribute no signals instead of throwing.
+ *
+ * @param unitId      capability-resolution id (`api/<area>`, `flows/<flow>`, or a screen)
+ * @param reportName  the bare name used for `.sungen/reports/<name>-audit.json` (+ test-result)
+ * @param cwd         directory that contains the `.sungen/reports` folder
+ * @param generatedDir the unit's specs/generated dir (for runtime failures); optional
+ * @returns the resolved capability id, how many rules it offered, the matched proposals (in
+ *          signal order: audit findings first, then runtime failures by file name) and the
+ *          signal texts that no rule matched
+ */
+export function planRepair(unitId: string, reportName: string, cwd: string, generatedDir?: string): RepairPlan {
+  discoverAndRegisterCapabilities();
+  const capId = scoringCapabilityFor(unitId, capabilityRegistry.defaultCapabilityId());
+  const rules = (capId ? capabilityRegistry.get(capId)?.repair?.rules : undefined) ?? [];
+  const signals: { source: 'audit' | 'runtime'; text: string }[] = [];
+  const auditPath = path.join(cwd, '.sungen', 'reports', `${reportName}-audit.json`);
+  if (fs.existsSync(auditPath)) {
+    try { for (const f of JSON.parse(fs.readFileSync(auditPath, 'utf8')).findings ?? []) signals.push({ source: 'audit', text: String(f) }); } catch { /* ignore */ }
+  }
+  if (generatedDir && fs.existsSync(generatedDir)) {
+    let entries: string[] = [];
+    // Sorted so the plan does not depend on the filesystem's directory-listing order; a path that
+    // exists but cannot be listed (e.g. it is a file) yields no runtime signals instead of throwing.
+    try { entries = fs.readdirSync(generatedDir).sort(); } catch { /* best-effort: no runtime signals */ }
+    for (const f of entries) {
+      if (!/test-result.*\.json$/.test(f)) continue;
+      for (const msg of failuresFromResult(path.join(generatedDir, f))) signals.push({ source: 'runtime', text: msg });
+    }
+  }
+  const proposals: RepairProposal[] = [];
+  const unmatched: string[] = [];
+  for (const s of signals) {
+    const rule = rules.find((r) => {
+      // A `g`/`y` regex keeps `lastIndex` between `.test()` calls; reset it so every signal is
+      // matched from its start and the outcome does not depend on the previous signal.
+      r.match.lastIndex = 0;
+      return r.match.test(s.text);
+    });
+    if (rule) proposals.push({ source: s.source, signal: s.text, ruleId: rule.id, fix: rule.fix });
+    else unmatched.push(s.text);
+  }
+  return { capability: capId, rulesAvailable: rules.length, proposals, unmatched };
+}

package/src/harness/script-check.ts CHANGED Viewed

@@ -16,6 +16,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { loadScenarios, ScenarioInfo } from './parse';
+import { featureBasename } from './unit-paths';
 export interface ScriptCheckResult {
   screen: string;
@@ -67,6 +68,9 @@ export function analyzeFaithfulness(specSrc: string, automatedTitles: Set<string
   const hollowSteps: { test: string; step: string }[] = [];
   for (const blk of extractTestBlocks(specSrc)) {
     if (!automatedTitles.has(blk.title)) continue; // only non-@manual scenarios
+    // TQ-11 — a capability-pending @requires scenario compiles to a `test.skip(true, …)` stub:
+    // it intentionally proves nothing here (it runs once the driver is added), so it is not a bypass.
+    if (blk.body.some((l) => /\btest\.skip\(\s*true\b/.test(l))) continue;
     const body = blk.body;
     // An assertion is a Playwright `expect(...)` OR a Data Driver DB assertion
     // (`db.assertRow/assertNoRow/assertCount/...`) — a DB check is a real oracle, so a
@@ -106,9 +110,18 @@ function normalize(src: string): string {
     .trim();
 }
-function findSpec(dir: string, name: string, flowMode: boolean): string | null {
+/** The unit kind — drives the generated-spec subdir + the qa source dir. */
+export type UnitKind = 'screen' | 'flow' | 'api';
+/**
+ * Generated-spec subdir for a unit: screen → <name>, flow → flows/<name>, api → api/<name>.
+ *
+ * @param dir  root directory the subdir is joined onto
+ * @param name unit name appended as the last path segment
+ * @param kind unit kind; any value other than `flow` or `api` takes the screen layout
+ * @returns the joined path; nothing is checked on disk here
+ */
+function specSubdir(dir: string, name: string, kind: UnitKind): string {
+  return kind === 'flow' ? path.join(dir, 'flows', name) : kind === 'api' ? path.join(dir, 'api', name) : path.join(dir, name);
+}
+function findSpec(dir: string, name: string, kind: UnitKind): string | null {
   // Screens compile to  <dir>/<name>/<feature>.spec.ts
   // Flows   compile to  <dir>/flows/<name>/<feature>.spec.ts
+  // Api     compile to  <dir>/api/<name>/<feature>.spec.ts
   // Scope the search to THIS target's own subdir — otherwise the first spec of
   // ANY other screen/flow is returned, which (for an uncompiled flow) falsely
   // reports the wrong screen's tests as drift.
@@ -121,19 +134,19 @@ function findSpec(dir: string, name: string, flowMode: boolean): string | null {
       else if (e.name.endsWith('.spec.ts')) hits.push(p);
     }
   };
-  const scoped = flowMode ? path.join(dir, 'flows', name) : path.join(dir, name);
+  const scoped = specSubdir(dir, name, kind);
   if (!fs.existsSync(scoped)) return null; // no spec for this target (e.g. not compiled yet)
   walk(scoped);
   return hits[0] ?? null;
 }
-export async function runScriptCheck(screenDir: string, screenName: string, flowMode: boolean): Promise<ScriptCheckResult> {
-  const featurePath = path.join(screenDir, 'features', `${screenName}.feature`);
+export async function runScriptCheck(screenDir: string, screenName: string, kind: UnitKind): Promise<ScriptCheckResult> {
+  const featurePath = path.join(screenDir, 'features', `${featureBasename(screenName)}.feature`);
   const scenarios = loadScenarios(featurePath);
   const automated = scenarios.filter((s) => !s.manual);
   const manual = scenarios.filter((s) => s.manual);
-  const committedSpec = findSpec(path.join(process.cwd(), 'specs', 'generated'), screenName, flowMode);
+  const committedSpec = findSpec(path.join(process.cwd(), 'specs', 'generated'), screenName, kind);
   const findings: string[] = [];
   let specTitles: string[] = [];
@@ -167,10 +180,14 @@ export async function runScriptCheck(screenDir: string, screenName: string, flow
     try {
       const { CodeGenerator } = require('../generators/test-generator/code-generator');
       const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'sungen-scriptcheck-'));
-      const qaSourceDir = path.join(process.cwd(), 'qa', flowMode ? 'flows' : 'screens');
-      const gen = new CodeGenerator({ framework: 'playwright', screenName, runtimeData: true, flowMode });
+      const qaSourceDir = path.join(process.cwd(), 'qa', kind === 'flow' ? 'flows' : kind === 'api' ? 'api' : 'screens');
+      // api units derive their unit id (api/<area>) from the feature path — like `generate --api`;
+      // screen/flow pass screenName + flowMode explicitly (unchanged → byte-identical regenerate).
+      const gen = kind === 'api'
+        ? new CodeGenerator({ framework: 'playwright', runtimeData: true })
+        : new CodeGenerator({ framework: 'playwright', screenName, runtimeData: true, flowMode: kind === 'flow' });
       await gen.generateAllTests(qaSourceDir, tmp, [featurePath]);
-      const fresh = findSpec(tmp, screenName, flowMode);
+      const fresh = findSpec(tmp, screenName, kind);
       if (fresh) {
         const a = normalize(specSrc);
         const b = normalize(fs.readFileSync(fresh, 'utf-8'));

package/src/harness/sensors.ts CHANGED Viewed

@@ -111,6 +111,11 @@ export interface DepthResult {
   businessCriticalShallow: number;   // = depth-required scenarios that are shallow
   bcDepthRatio: number;              // fraction of depth-required scenarios with a real data assertion
   shallowBusinessCritical: { name: string; category?: string }[];
+  // @manual scenarios that would be business-critical if automated (match a data-theme).
+  // They are EXCLUDED from bcDepthRatio, so deferring them to @manual collapses the
+  // denominator and inflates the ratio toward 1.0 — reported so a high ratio on a tiny
+  // denominator isn't misread as "all good" (TQ-3).
+  deferredBusinessCritical: number;
   // Depth-as-Gate (harness-roadmap P1)
   focus: string;                     // intent focus driving the threshold
   threshold: number;                 // required bcDepthRatio for this focus
@@ -124,6 +129,16 @@ const DEPTH_THRESHOLDS: Record<string, number> = {
 };
 const WARN_ONLY_FOCUS = new Set(['smoke']);
+/** The required businessDepth ratio for a focus (default `functional` = 0.7). Shared so a capability
+ *  gate (e.g. the API gate, which computes its own depth) uses the SAME thresholds as the UI gate.
+ *
+ *  @param focus intent focus used as the key into `DEPTH_THRESHOLDS`
+ *  @returns the threshold registered for `focus`, or the `functional` entry when `focus` has none
+ */
+export function depthThresholdFor(focus: string): number {
+  return DEPTH_THRESHOLDS[focus] ?? DEPTH_THRESHOLDS.functional;
+}
+/**
+ * Whether a depth miss only WARNs (vs FAILs) for a focus (smoke).
+ *
+ * @param focus intent focus to look up
+ * @returns true when `focus` is a member of `WARN_ONLY_FOCUS`, false otherwise
+ */
+export function depthWarnOnly(focus: string): boolean {
+  return WARN_ONLY_FOCUS.has(focus);
+}
 /**
  * Depth = do DATA-correctness scenarios actually assert DATA (not just visibility)?
  * "Depth-required" is CATALOG-DRIVEN: only scenarios matching a theme whose
@@ -151,6 +166,8 @@ export function assertionDepth(
   const required = nonManual.filter(isDepthRequired);
   const reqShallow = required.filter((s) => s.shallow);
+  // Business-critical scenarios deferred to @manual (match a data-theme but excluded above).
+  const deferredBusinessCritical = scenarios.filter((s) => s.manual && isDepthRequired(s)).length;
   // No data-theme scenarios on this screen → depth is not the binding constraint
   // (the viewpoint gate already flags missing data themes). Don't double-penalize.
   const ratio = required.length ? 1 - reqShallow.length / required.length : 1;
@@ -167,12 +184,64 @@ export function assertionDepth(
     businessCriticalShallow: reqShallow.length,
     bcDepthRatio: ratio,
     shallowBusinessCritical: reqShallow.map((s) => ({ name: s.name, category: s.category })),
+    deferredBusinessCritical,
     focus,
     threshold,
     verdict,
   };
 }
+// ---------- Sensor 2b: Automatable-@manual (TQ-2) ----------
+export interface AutomatableManualResult {
+  manualTotal: number;                                  // count of all @manual scenarios
+  automatable: number;                                  // count of @manual scenarios that carry a data assertion and hit no judgment marker
+  scenarios: { name: string; category?: string }[];     // name + category of each automatable one (to surface)
+}
+// Genuine-judgment markers (M6/M8/M9 territory): visual/responsive/a11y/mock/network/
+// external/empty-state — these legitimately stay @manual (or need a future driver).
+// The whole alternation is wrapped in `\b( … )\b`, so every alternative must end at a word
+// boundary. A bare stem therefore needs an explicit `\w*` tail (`accessib\w*` → "accessible",
+// "accessibility") and a singular needs an optional plural (`results?` → "no results").
+const JUDGMENT_MARKER =
+  /\b(visual|responsive|layout|breakpoint|mobile|tablet|viewport|accessib\w*|a11y|keyboard|screen reader|focus order|\baria\b|empty[- ]?(state|product|list|category|cart)|no[- ]?results?|missing (image|product|data)|placeholder|fallback|slow|failing|offline|network|loading|spinner|external|new tab|video tutorial|email|mailbox|download|payment gateway|exploratory|not worth)\b/;
+/**
+ * Automatable-@manual (TQ-2) — a `@manual` scenario whose steps are fully DSL-expressible
+ * (it carries a real data assertion) and shows no genuine-judgment marker is *automatable*:
+ * it was deferred (typically cross-screen → a flow) rather than truly un-automatable. Leaving
+ * it `@manual` creates a non-running duplicate AND inflates businessDepth (it's excluded from
+ * the ratio). The UI analog of the API driver's `api-manual-automatable`.
+ *
+ * Concretely: keep the scenarios with `manual` set, then keep those with `hasDataAssertion` set
+ * whose `haystack` does not match `JUDGMENT_MARKER`.
+ *
+ * @param scenarios all scenarios of the unit (manual and automated)
+ * @returns the @manual total, the automatable count, and name/category of each automatable one
+ */
+export function automatableManual(scenarios: ScenarioInfo[]): AutomatableManualResult {
+  const manual = scenarios.filter((s) => s.manual);
+  const automatable = manual.filter((s) => s.hasDataAssertion && !JUDGMENT_MARKER.test(s.haystack)); // NOTE(review): the marker regex has no `i` flag — presumably `haystack` is already lower-cased; confirm in parse
+  return {
+    manualTotal: manual.length,
+    automatable: automatable.length,
+    scenarios: automatable.map((s) => ({ name: s.name, category: s.category })),
+  };
+}
+// ---------- TQ-4: deferral-aware coverage credit ----------
+/**
+ * Which of the given gate gap-themes are deeply covered by a FLOW scenario (a cross-screen
+ * deferral the flow actually fulfils). Returns theme → covering flow. The screen audit uses
+ * this to credit an inherently-cross-screen theme to the flow that owns it, instead of
+ * double-counting it as a screen gap. A flow scenario covers a theme when its haystack hits
+ * the theme keywords AND it carries a data assertion (`deep`).
+ *
+ * Matching is a plain substring test of each lower-cased keyword against `haystack`; the
+ * haystack itself is not lower-cased here (presumably the caller already did — TODO confirm).
+ * Only the FIRST covering scenario per theme is reported, and an empty keyword matches any
+ * haystack because `includes('')` is always true.
+ *
+ * @param gaps          gap themes with the keywords that identify them
+ * @param flowScenarios candidate flow scenarios: owning flow, searchable text, and depth flag
+ * @returns one `{ theme, flow }` per covered gap, in the order of `gaps`; uncovered gaps are omitted
+ */
+export function flowCoveredThemes(
+  gaps: { theme: string; keywords: string[] }[],
+  flowScenarios: { flow: string; haystack: string; deep: boolean }[],
+): { theme: string; flow: string }[] {
+  const out: { theme: string; flow: string }[] = [];
+  for (const g of gaps) {
+    const hit = flowScenarios.find((s) => s.deep && g.keywords.some((k) => s.haystack.includes(k.toLowerCase()))); // first deep scenario hitting any keyword
+    if (hit) out.push({ theme: g.theme, flow: hit.flow });
+  }
+  return out;
+}
 /** Collect data-correctness themes (depth.requires) for a page-type + universal. */
 export function dataThemesFor(catalog: Catalog, pageType: string | null): CatalogTheme[] {
   const themes: CatalogTheme[] = [];
@@ -384,8 +453,8 @@ const CLAIM_RULES: ClaimRule[] = [
     // "double-click does not create two orders" — not a per-feature keyword.
     claim: 'no-side-effect/no-duplicate',
     title: /(?=.*\b(submit|sen[dt]|resend|resubmit|re-?fire|re-?issue|re-?post|repost|create|charge|order|payment|\bpay\b|email|request|\botp\b|insert|register|book|duplicate|double[- ]?submit|again|twice)\b)(?=.*(\bno\b|\bnot\b|n['’]t\b|\bnever\b|\bwithout\b|\bcannot\b|prevent|block|avoid|reject|disabl|\bdeny\b|denies|\bkhông\b|\bchưa\b))/i,
-    proof: /\bcount\b|row with \{\{|table with|tohavecount|is hidden|are hidden|not complete|no longer/,
-    need: 'a record/request-count proof (count stays at one, e.g. `User see [Table] row with {{count}}`) or @manual with a request-count oracle',
+    proof: /\bcount\b|ok_count|status_counts|row with \{\{|table with|tohavecount|is hidden|are hidden|not complete|no longer/,
+    need: 'a record/request-count proof (count stays at one, e.g. `User see [Table] row with {{count}}`, an API `{{name.ok_count}}` invariant, or a `@query` DB count) or @manual with a request-count oracle',
     hint: 'a "does-not-happen / does-not-repeat" claim about a state-changing action is NOT proven by a terminal `see [...] page` — that page is identical whether or not the action (re-)fired. Prove the side-effect count is unchanged, or mark @manual with a setup→action→assert-no-duplicate oracle.',
     severity: 'fail',
   },

package/src/harness/trace.ts CHANGED Viewed

@@ -13,6 +13,7 @@
  */
 import * as fs from 'fs';
 import * as path from 'path';
+import { reportSlug } from './unit-paths';
 import { segmentRuns, latestRunEvents, LedgerEvent } from './ledger';
 interface ManualItem { scenario: string; reason: string }
@@ -22,7 +23,7 @@ function readJson(p: string): any | null {
 }
 function readLedger(screen: string): any[] {
-  const p = path.join(process.cwd(), '.sungen', 'ledger', `${screen}.jsonl`);
+  const p = path.join(process.cwd(), '.sungen', 'ledger', `${reportSlug(screen)}.jsonl`);
   if (!fs.existsSync(p)) return [];
   return fs.readFileSync(p, 'utf-8').split('\n').filter(Boolean).map((l) => { try { return JSON.parse(l); } catch { return null; } }).filter(Boolean);
 }
@@ -76,7 +77,7 @@ export function buildTrace(screenDir: string, screenName: string): TraceReport {
   const recordedSteps = [...new Set(ledger.map((e) => e.step.replace(/:\d+$/, '')))];
   const missingSteps = EXPECTED_PHASES.filter((p) => !recordedSteps.includes(p));
-  const auditRaw = readJson(path.join(process.cwd(), '.sungen', 'reports', `${screenName}-audit.json`));
+  const auditRaw = readJson(path.join(process.cwd(), '.sungen', 'reports', `${reportSlug(screenName)}-audit.json`));
   let audit: TraceReport['audit'] = null;
   if (auditRaw) {
     const subs: Record<string, number> = {
@@ -91,7 +92,7 @@ export function buildTrace(screenDir: string, screenName: string): TraceReport {
     };
   }
-  const scRaw = readJson(path.join(process.cwd(), '.sungen', 'reports', `${screenName}-script-check.json`));
+  const scRaw = readJson(path.join(process.cwd(), '.sungen', 'reports', `${reportSlug(screenName)}-script-check.json`));
   const drift = scRaw ? scRaw.drift : null;
   const manual = parseManual(path.join(screenDir, 'features', `${screenName}.feature`));

package/src/harness/unit-paths.ts ADDED Viewed

@@ -0,0 +1,14 @@
+/**
+ * Unit-path helpers (api-flow fix). A unit id may be a bare name (`orders`, `login`) or a nested
+ * api-flow id (`flows/<flow>`). Two derivations the harness/CLI need:
+ *  - featureBasename: the `.feature` filename — the LAST path segment (`flows/x` → `x`), so
+ *    `<dir>/features/<basename>.feature` resolves (the bug: the full id looked for
+ *    `features/flows/x.feature` → 0 scenarios).
+ *  - reportSlug: a flat key for `.sungen/reports/<slug>-*.json` + `.sungen/ledger/<slug>.jsonl`
+ *    (`flows/x` → `flows-x`), so artifacts never nest under a `flows/` subdir and read/write agree.
+ * Bare names (no slash) are unchanged by both → no regression for screens/flows/areas.
+ */
+import * as path from 'path';
+export const featureBasename = (unit: string): string => path.basename(unit); // last path segment: 'flows/x' → 'x', 'orders' → 'orders'
+export const reportSlug = (unit: string): string => unit.replace(/[\\/]+/g, '-'); // each run of '/' or '\' becomes one '-': 'flows/x' → 'flows-x'

package/src/index.ts ADDED Viewed

@@ -0,0 +1,32 @@
+/**
+ * Public API of `@sun-asterisk/sungen` — the capability SPI plus the shared compiler/harness surface
+ * that capability drivers (`@sungen/driver-*`) build against. Drivers import from here; core never
+ * imports from a driver (discovery loads them at runtime). Keep this surface small and intentional.
+ */
+// --- Capability SPI ---
+export { capabilityRegistry, CapabilityRegistry } from './capabilities/registry';
+export type { CapabilityDescriptor } from './capabilities/registry';
+export type { Sensor, SensorFinding, AdvisoryScanInput, GateInput } from './capabilities/sensor';
+export type { Context, DiscoveryProvider, ContextMapper, GenerationUnit, RepairProvider, RepairRule } from './capabilities/context';
+export { discoverUnitContext } from './orchestrator/context-discovery';
+export type { DiscoveredContext } from './orchestrator/context-discovery';
+// --- Step-pattern authoring (a driver contributes step patterns via its descriptor) ---
+export type { PatternContext, StepPattern, StepTemplateData } from './generators/test-generator/patterns/types';
+export type { MappedStep } from './generators/test-generator/step-mapper';
+export type { ParsedStep } from './generators/gherkin-parser';
+export { getPathCode, inferPath, resolvePathVariables } from './generators/test-generator/utils/path-inference';
+// --- Precondition-annotation override grammar (shared by the @query / @api driver codegen) ---
+export { parseQueryOverrides } from './harness/annotation-overrides';
+// --- Named-query catalog (shared: the DB driver's codegen + core's data-driven advisory lint) ---
+export { resolveQuery, compileQuery, lintCatalog } from './harness/query-catalog';
+export type { QueryEntry } from './harness/query-catalog';
+// --- Shared harness: viewpoint catalog + coverage gate / assertion depth ---
+// (the UI capability's gateProvider composes these; they also back core's ingest + audit fallback)
+export { loadCatalog, viewpointGate, assertionDepth, dataThemesFor, depthThresholdFor, depthWarnOnly } from './harness/sensors';
+export type { Catalog, GateResult, DepthResult } from './harness/sensors';
+export type { ScenarioInfo, ViewpointEntry } from './harness/parse';

package/src/orchestrator/ai-rules-updater.ts CHANGED Viewed

@@ -47,6 +47,7 @@ export const AI_RULES_FILE_MAPPING: [string, string][] = [
   ['claude-skill-selector-fix.md', '.claude/skills/sungen-selector-fix/SKILL.md'],
   ['claude-skill-tc-review.md', '.claude/skills/sungen-tc-review/SKILL.md'],
   ['claude-skill-harness-audit.md', '.claude/skills/sungen-harness-audit/SKILL.md'],
+  ['claude-skill-api-design.md', '.claude/skills/sungen-api-design/SKILL.md'],
   ['claude-skill-ingest-legacy.md', '.claude/skills/sungen-ingest-legacy/SKILL.md'],
   ['claude-skill-viewpoint.md', '.claude/skills/sungen-viewpoint/SKILL.md'],
   ['claude-skill-viewpoint-group-a-data-entry.md', '.claude/skills/sungen-viewpoint/group-a-data-entry.md'],
@@ -79,6 +80,7 @@ export const AI_RULES_FILE_MAPPING: [string, string][] = [
   ['github-skill-sungen-selector-fix.md', '.github/skills/sungen-selector-fix/SKILL.md'],
   ['github-skill-sungen-tc-review.md', '.github/skills/sungen-tc-review/SKILL.md'],
   ['github-skill-sungen-harness-audit.md', '.github/skills/sungen-harness-audit/SKILL.md'],
+  ['github-skill-sungen-api-design.md', '.github/skills/sungen-api-design/SKILL.md'],
   ['github-skill-sungen-ingest-legacy.md', '.github/skills/sungen-ingest-legacy/SKILL.md'],
   ['github-skill-sungen-viewpoint.md', '.github/skills/sungen-viewpoint/SKILL.md'],
   ['github-skill-sungen-viewpoint-group-a-data-entry.md', '.github/skills/sungen-viewpoint/group-a-data-entry.md'],

package/src/orchestrator/context-discovery.ts ADDED Viewed

@@ -0,0 +1,50 @@
+/**
+ * Orchestration phase: Discover → Contextualize (AO-3).
+ *
+ * The capability SPI declared `discovery` (sources → Context slice) + `contextMapper` (Context →
+ * generation units) since R1, but nothing consumed them. This is the deterministic consumer the
+ * orchestration loop (`/sungen:design`) calls before generating: resolve the unit's capability,
+ * run ITS discovery + contextMapper, and hand back the normalized Context + the generation-unit
+ * work-list. Capability-agnostic — a screen, an api area, or a future `mobile/` unit all flow here.
+ */
+import { capabilityRegistry } from '../capabilities/registry';
+import { discoverAndRegisterCapabilities } from '../capabilities/discover';
+import type { Context, GenerationUnit } from '../capabilities/context';
+export interface DiscoveredContext { // the value returned by discoverUnitContext()
+  capability: string; // id of the capability that handled the unit ('ui' when none resolved)
+  context: Context; // normalized context: target plus the merged sources/facts of the discovery slice
+  units: GenerationUnit[]; // work-list from the capability's contextMapper; empty when it has none
+}
+/**
+ * Map a unit id (relative to qa/) to a discovery target.
+ *
+ * Classification is by prefix only, checked in this order: `api/` → api, `flows/` → flow,
+ * anything else → screen. An `api/flows/<flow>` id therefore resolves to kind `api`. The id is
+ * passed through unchanged.
+ */
+function targetForUnit(unitId: string): Context['target'] {
+  if (unitId.startsWith('api/')) return { kind: 'api', id: unitId };
+  if (unitId.startsWith('flows/')) return { kind: 'flow', id: unitId };
+  return { kind: 'screen', id: unitId };
+}
+/**
+ * Run the Discover → Contextualize phase for a unit. `unitId` is the catalog-resolution id
+ * (`<screen>` · `flows/<flow>` · `api/<area>` · `api/flows/<flow>`).
+ *
+ * Steps: register capabilities, derive the target from the id prefix, pick the capability, run
+ * its `discovery.discover` (if it has one) and merge the returned slice into the context, then
+ * ask its `contextMapper` (if it has one) to decompose the context into generation units.
+ *
+ * @param unitId unit id relative to qa/
+ * @param cwd    directory handed to the discovery provider; defaults to `process.cwd()`
+ * @returns the capability id, the assembled context and the generation units. When no
+ *          capability resolves, `capability` is reported as `'ui'` while
+ *          `context.target.capability` stays undefined and `units` is empty.
+ *          A rejection from `discover` is not caught here and propagates to the caller.
+ */
+export async function discoverUnitContext(unitId: string, cwd: string = process.cwd()): Promise<DiscoveredContext> {
+  discoverAndRegisterCapabilities();
+  const target = targetForUnit(unitId);
+  // The discovering capability is the first whose DiscoveryProvider claims the target; else the default.
+  const cap = capabilityRegistry.all().find((c) => c.discovery?.appliesTo?.(target))
+    ?? (capabilityRegistry.defaultCapabilityId() ? capabilityRegistry.get(capabilityRegistry.defaultCapabilityId()!) : undefined);
+  let context: Context = { target: { ...target, capability: cap?.id }, sources: {}, facts: {} }; // empty baseline, kept as-is when the capability has no discovery provider
+  if (cap?.discovery) {
+    const slice = await cap.discovery.discover(target, { cwd });
+    context = {
+      target: { ...target, capability: cap.id },
+      sources: { ...context.sources, ...(slice.sources ?? {}) },
+      facts: { ...context.facts, ...(slice.facts ?? {}) },
+      ...(slice.connectivity ? { connectivity: slice.connectivity } : {}), // key is added only when the slice provides it
+    };
+  }
+  const units = cap?.contextMapper?.decompose(context) ?? [];
+  return { capability: cap?.id ?? 'ui', context, units }; // NOTE(review): the 'ui' fallback is hard-coded here — confirm it matches the registry's default id
+}

package/src/orchestrator/templates/ai-instructions/claude-agent-reviewer.md CHANGED Viewed

@@ -17,8 +17,14 @@ You are an **independent Senior QA Reviewer**. You did **not** write these tests
    - **Negative / "does-not-happen" claims** (any language — "does not", "no", "prevents", "không", "chưa"): the proof must be a step whose result **differs** between the claim holding and not holding. Ask: *would this `Then` still pass if the bad thing happened?* If yes, it proves nothing. The classic trap: title "browser back does **not** re-submit" with `Then see [sent] page` — that page is identical whether or not the request re-fired. Demand a **contrast/count** proof (record count unchanged, state hidden/empty, error shown) or a justified `@manual` with a setup→action→assert-absence oracle. This generalises to every side-effect (re-charge, duplicate order, resend OTP, data leak), not just re-submit.
 2. **Observable Then.** Is each `Then` an **observable outcome**, not a restated action or a tautology (e.g. `Then User see [Carousel] section` after clicking next — proves nothing changed)?
 3. **Business-critical depth.** For cart / product-detail / filter / list viewpoints, do steps assert **DATA** (name, price, quantity, all-items-belong) — not just page/modal visibility? Recommend the concrete deep step: `User remember [X] text as {{v}}` + `... with {{v}}`, or `User see all [X] contain {{v}}`.
-4. **@manual justification.** Is each `@manual` genuinely unautomatable (cross-screen/external/visual) — or a cop-out to dodge the gate? Cross-screen → should be a flow.
+4. **@manual justification.** Is each `@manual` genuinely unautomatable (external/visual/a11y/mock-needed/judgment) — or a cop-out to dodge the gate? **Cross-screen is NOT a valid `@manual` reason** — a home→detail/cart journey runs as one automated test, so it belongs in a **flow** (`/sungen:add-flow`), not a `@manual` screen copy. A `@manual` scenario that still carries full automatable steps (a data assertion, no visual/mock/a11y judgment) is automatable — flag it (the gate reports it as `MANUAL-AUTOMATABLE`). Genuine `@manual` must name its reason (`@manual:Mx`).
 5. **Meaning-level duplicates & missing criticals** the keyword gate can't see.
+6. **API units** (`qa/api/<area>/` — `@api` scenarios, no UI). Judge what the api gate can't:
+   - **Prove the effect, not the status.** A mutating endpoint's success path asserting only `{{r.status}} is 201` proves nothing about WHAT changed — demand a **body** assertion (`{{r.body.id}}` / `{{r.body.<field>}}`), a **`@query`** DB side-effect, or (idempotency) a `{{r.ok_count}}` invariant. This is the API businessDepth bar.
+   - **Error matrix coherent.** `@cases` rows are a real failure family (validation/auth/conflict) with realistic inputs → declared statuses, not padding.
+   - **Flows self-clean.** A CRUD/auth chain deletes what it created (final `@api:delete_*`) or is `@cleanup`-tagged.
+   - **Idempotency uses the DB oracle.** A "no double-charge / exactly once" claim is proven by `@concurrent` + a `@query` count, not HTTP status alone (status can lie under a race).
+   - **Auth negatives** exist for protected mutations (401/403), not just the happy path.
 ## Output (do NOT edit any file)
 Return a concise verdict:

package/src/orchestrator/templates/ai-instructions/claude-cmd-create-test.md CHANGED Viewed

@@ -23,7 +23,11 @@ You are a **Senior QA Engineer** specialized in test case design. You structure
 Parse **name** from `$ARGUMENTS`. If missing, ask the user.
-**Auto-detect context**: check if `qa/flows/<name>/` exists → flow mode. Else check `qa/screens/<name>/` → screen mode. This determines paths, generation strategy, and CLI commands.
+**Auto-detect context**: check if `qa/api/<name>/` or `qa/api/flows/<name>/` exists → **API unit mode** (below). Else if `qa/flows/<name>/` exists → flow mode. Else if `qa/screens/<name>/` exists → screen mode. This determines paths, generation strategy, and CLI commands.
+## API unit mode (driver-api)
+If the unit is **api-first** (`qa/api/<name>/` or `qa/api/flows/<name>/`), the design loop differs — **no visual capture, no selectors**; the contract is the named-endpoint catalog. **Follow the `sungen-api-design` skill end-to-end** instead of the screen/flow steps below: `sungen context --area <name>` (discover) → API viewpoint overview → generate `@api`/`@cases`/flow/`@concurrent`/`@query` scenarios → **`sungen audit --area <name>` gate + the `sungen-reviewer` sub-agent + repair loop to businessDepth ≥ 0.7** → record + trace. Then jump to the "Converge" next-step options (recommend `/sungen:run-test <name>`). The capture / viewpoint-group / selector steps do **not** apply.
 ## Steps
@@ -31,9 +35,10 @@ Parse **name** from `$ARGUMENTS`. If missing, ask the user.
    **Screen**: Verify `qa/screens/<name>/` exists. If not → `/sungen:add-screen` first.
 2. Check if `.feature` file already has scenarios.
    - If yes → use `AskUserQuestion` to ask the update mode (see `sungen-tc-generation` skill — mode depends on which tiers already exist).
-   - If no → fresh creation. Use `AskUserQuestion` to ask generation scope:
-     - **Tier 1 — Critical & High priority** — ~10-15 scenarios/section covering happy paths, core validation, security basics **(Recommended)**
-     - **Full coverage — All tiers at once** — generates Tier 1 + 2 + 3 in one run. Large output (~40-60 scenarios/section), best for experienced users who want complete coverage immediately
+   - If no → fresh creation. **Write the feature file incrementally** (successive `Write`/`Edit`, ≈10-15 scenarios per call) — never emit the whole suite in one response, or it can exceed the model's output-token cap (`API Error: Claude's response exceeded the N output token maximum`). Use `AskUserQuestion` to ask generation scope:
+     - **Tier 1 — Critical & High priority** — ~10-15 scenarios/section: happy paths, core validation, security basics **(Recommended)**
+     - **Full coverage (incremental)** — Tier 1 + 2 + 3, written tier-by-tier in batches (`Write` T1 → `Edit` append T2 → `Edit` append T3). Safe on any output-token budget.
+     - **Full coverage (single pass)** — generate everything in one go (~40-60 scenarios/section). Faster, but **only if you raised your output cap** (`CLAUDE_CODE_MAX_OUTPUT_TOKENS ≥ 64000`) — otherwise it errors mid-generation. For power users on a high-token model/config.
 3. **Read project context + screen requirements**
    **Project context** — check `qa/context.md` (project root, not screen-specific):
@@ -73,7 +78,7 @@ Parse **name** from `$ARGUMENTS`. If missing, ask the user.
    - **Independent semantic review.** **Claude Code:** spawn the **`sungen-reviewer`** sub-agent (Task tool, `subagent_type: sungen-reviewer`) — it judges what the gate can't (does each scenario's steps PROVE its title/viewpoint, observable Thens, business-critical assertion depth) and returns `VERDICT` + `ISSUES` with concrete fixes. **Merge its NEEDS-REPAIR issues with the audit findings.** (Copilot / no sub-agents: run the same review inline using the `sungen-reviewer` criteria.)
    - Repair **both** the audit findings and the reviewer issues (budget 3 rounds), then re-audit:
    - If the gate FAILs or there are findings, **repair** (budget 3 rounds), then re-audit:
-     - **GATE** missing critical theme → generate scenarios for it. If it is **cross-screen** (cart-correctness, product-detail-consistency, filter-result-correctness): write the scenario with **observable data assertions** (`... with {{value}}`, `table ... with {{value}}`), tag it `@manual`, and add a comment `# Deferred to a flow (<screen> -> <target>) for automation`. Do **not** fake a shallow single-screen pass.
+     - **GATE** missing critical theme → generate scenarios for it. If it is **cross-screen** (cart-correctness, product-detail-consistency, filter-result-correctness): **automate it in the flow** (`/sungen:add-flow` if none exists) with observable data assertions (`... with {{value}}`, `see all ... contain {{v}}`) — a single home→target journey runs as one Playwright test. Do **not** write a full `@manual` duplicate of it on the screen (that is a non-running dead copy — `sungen audit` flags it `MANUAL-AUTOMATABLE`), and do **not** fake a shallow single-screen pass. Reserve `@manual` for true judgment / missing-capability, tagged `@manual:Mx`.
      - **DEPTH** → replace `see [X] page/section` on business-critical scenarios with data assertions.
      - **BALANCE** → stop expanding secondary viewpoints; add business-core scenarios first.
      - **TRACE** → align `VP-` ids with the viewpoint-overview.

package/src/orchestrator/templates/ai-instructions/claude-cmd-run-test.md CHANGED Viewed

@@ -30,7 +30,23 @@ If the count is 0 → use `AskUserQuestion` to offer:
 Skip this pre-flight when `--env` matches the base locale (no overlay needed in that case).
-**Auto-detect context**: check if `qa/flows/<name>/` exists → flow mode (base path: `qa/flows/<name>/`). Else check `qa/screens/<name>/` → screen mode (base path: `qa/screens/<name>/`).
+**Auto-detect context**: check if `qa/api/<name>/` or `qa/api/flows/<name>/` exists → **API unit mode** (below). Else check `qa/flows/<name>/` → flow mode (base path: `qa/flows/<name>/`). Else check `qa/screens/<name>/` → screen mode (base path: `qa/screens/<name>/`).
+## API unit mode (driver-api) — no selectors
+If the unit is **api-first**, skip every selector/capture phase (an API test has no DOM). Instead:
+1. **Resolve the datasource** — ensure the `kind: api` datasource's `base_url` + auth are wired in `qa/datasources.yaml` + `.env.qa` (the `${X_URL}` key from `sungen api init`). A `production` datasource is refused unless `SUNGEN_ALLOW_PROD=1`.
+2. **Compile**: `[ -x ./bin/sungen.js ] && ./bin/sungen.js generate --area <name> || npx sungen generate --area <name>` → `specs/generated/api/<name>/`.
+3. **Run**: `npx playwright test specs/generated/api/<name>/<name>.spec.ts` (per-spec JSON results, as below).
+4. **Auto-fix** (no selectors — the failure classes differ): use `sungen-error-mapping`.
+   - **401/403** → wire `@hybrid` + `@auth:<role>` (reuse the UI session) or the catalog `Bearer :token` header; suggest `sungen makeauth <role>`.
+   - **datasource/base_url unresolved** → set the `${X_URL}` key in `.env.qa`.
+   - **missing/empty bound param** → trace `{{var}}` to test-data or a prior `@api` response; fill it.
+   - **`expect.status` mismatch** → reconcile against `apis.yaml`/spec (the catalog is the oracle); **never hand-edit the generated spec** (re-`generate --area` instead).
+   - **400 "parameter missing" / body ignored** → the endpoint wants a form body, not JSON → set `encoding: form` (or `multipart`) on the catalog entry, re-`generate --area`. Don't mark the scenario `@manual`.
+   - **flaky** → enforce self-cleaning flows, per-row isolation (`@cases`), `@concurrent` caps.
+5. **Integrity + trace** — `sungen script-check --area <name>` (verify the spec is a 1:1 of the Gherkin; on DRIFT re-`generate --area`, never hand-edit) and `sungen trace --area <name>` (process map + HUMAN-LOOP FOCUS). Then report + offer next steps.
 ## Pre-run (phased — per `sungen-selector-fix` skill)
@@ -86,6 +102,7 @@ Skip this pre-flight when `--env` matches the base locale (no overlay needed in
 9. **Integrity check & trace (always run after the final run).**
    - `sungen script-check --screen <name>` — verify the generated spec is a **1:1** of the Gherkin (every non-@manual scenario ↔ one `test()`, no drift). If it reports **DRIFT** (spec hand-edited or stale), re-run `sungen generate --screen <name>` so the spec matches the feature, then re-run — **never hand-edit the generated spec** (auto-fix must edit `selectors.yaml`, not the `.spec.ts`).
    - `sungen ledger record --screen <name> --step run --ms <elapsed>` (record this run), then `sungen trace --screen <name>` — show the process map + bottlenecks + **HUMAN-LOOP FOCUS** (the @manual scenarios the QA must verify) to the user.
+10. **Capability-pending offer (consent-gated).** If `sungen audit --screen <name>` reports `AUTOMATION-READY-PENDING` (or the run shows `@requires:<cap>` tests skipped "requires …"), these are **automation-ready** scenarios waiting on an opt-in driver. Use `AskUserQuestion` to offer: *"N scenario(s) are automation-ready — enable `<cap>` to run them? (`sungen capability add <cap>`)"*. **Only on the user's yes** run `sungen capability add <cap>` then re-run those specs; on no, leave them skipped (they are NOT failures and NOT manual). **Never auto-install.**
 ## Playwright command guidelines

package/src/orchestrator/templates/ai-instructions/claude-skill-api-design.md ADDED Viewed

@@ -0,0 +1,62 @@
+---
+name: sungen-api-design
+description: The API-first design loop for an api unit (qa/api/<area> or qa/api/flows/<flow>) — discover the catalog, lay out the API viewpoints, generate @api/@cases/flow/@concurrent scenarios, then drive the sungen audit --area gate + reviewer + repair to a high businessDepth (≥0.7). Use when create-test/run-test detects an api unit (no selectors, no visual capture).
+---
+# API design loop (driver-api · Orchestration + Harness)
+Use this when the unit is **api-first** — `qa/api/<area>/` or `qa/api/flows/<flow>/`. There are **no selectors and no visual capture**: the contract is the **named-endpoint catalog** (`api/apis.yaml`), referenced by `@api:<name>`. QA writes **no HTTP code**. Full annotation reference: the **API Steps** guide (`@api` / `@cases` / flows / `@concurrent` / `@hybrid`).
+## The loop (mirror of /sungen:design, API-native)
+### 1. Discover (no capture)
+Run `sungen context --area <name>` — it reads the catalog and prints the **endpoints** + the **generation units** (one `matrix` unit per endpoint, an `async` unit per mutating endpoint, a `flow` unit for an api flow). Read `qa/api/<name>/requirements/spec.md` if present. No `apis.yaml` yet? → `sungen api import <openapi|csv>` or `sungen api add --area <name>` first.
+### 2. API viewpoint overview (by method-profile)
+For each endpoint, cover its viewpoints — severity-weighted by method:
+| Profile | Endpoints | Must cover | Then |
+|---|---|---|---|
+| read | GET, HEAD | `contract` (status + body shape) | `pagination`/`filter` (list), `not-found` (by-id) |
+| mutating | POST/PUT/PATCH/DELETE | `contract`, `error` (validation/4xx/auth) | `idempotency` (`@concurrent`), `side-effect` (`@query`) |
+Bands: **~70%** success+failure matrix · **~20%** flows (auth/CRUD chains) · **~10%** async/idempotency.
+### 3. Generate (incremental — never the whole suite in one Write)
+- **Contract**: `@api:<name>` + `expect {{name.status}} is …` **and a body assertion** (`{{name.body.<path>}}`).
+- **Error matrix**: `@api:<name>(p={{p}}) @cases:<dataset>` — one scenario, a dataset of `input → expected status`.
+- **Flow**: ordered `@api` tags threading a prior response (`token={{login.body.token}}` → the catalog `Bearer :token` header; `id={{create.body.id}}` → a path param). Self-clean (delete what you create).
+- **Idempotency**: `@api:<name> @concurrent:N` + `expect {{name.ok_count}} is 1`, cross-checked with `@query` (the DB is the oracle).
+### 4. Gate + repair (always — businessDepth ≥ 0.7 is the bar)
+Run `sungen audit --area <name>`; read `gateStatus` + `findings`. Then the **semantic reviewer** (sungen-reviewer sub-agent, API criteria). Repair **both** (budget 3 rounds), re-audit until PASS:
+| Finding | Repair |
+|---|---|
+| `VIEWPOINT-API-CONTRACT` | the endpoint is invoked but its response is never asserted → add `expect {{name.status}}` + a `{{name.body.…}}` check |
+| `VIEWPOINT-API-ERROR` | a mutating endpoint has no failure scenario → add a `@cases` error matrix (or an explicit 4xx) |
+| `VIEWPOINT-API-IDEMPOTENCY` | a mutating endpoint has no race check → add `@concurrent:N` + a `@query` DB cross-check |
+| `VIEWPOINT-API-MANUAL-AUTOMATABLE` | a `@manual` scenario whose endpoint resolves is automatable → drop `@manual`, use `@api` (+ `@cases`); reserve `@manual` for genuine judgment cases |
+| **`DEPTH-FAIL`** (businessDepth < 0.7) | a **mutating success** scenario asserts only `status` → make it **prove the effect**: assert a response **body** field, a **`@query`** side-effect, or a **`@concurrent` `ok_count`** invariant. (An error/`@cases` scenario that asserts only the status is fine — it is *not* depth-required.) |
+Stop when the gate PASSes + businessDepth ≥ 0.7, or the budget is exhausted → report residual gaps honestly (mark genuinely-unautomatable cases `@manual` with an oracle). Never fake a pass.
+### 5. Record + converge
+`sungen manifest --area <name>` (reuse) and record each phase in the ledger; show the trace + the HUMAN-LOOP FOCUS. (Integrity `script-check`/`trace` for api: see run-test.)
+## Taxonomy (label scenarios correctly)
+| Class | What | Examples |
+|---|---|---|
+| **Functional** | single-endpoint behaviour | happy contract · error/validation (`@cases`) · boundary/edge |
+| **Functional — flow/integration** | multi-endpoint journeys | auth/CRUD lifecycle (`create → login → get → delete`), cross-endpoint invariants |
+| **Non-Functional** | performance · reliability · **security** · concurrency/idempotency | `@concurrent` race/idempotency |
+A flow (`create → login → delete`) is a **Functional integration** test, **not** non-functional — don't file it under "Non-Functional". Reserve non-functional for perf/security/concurrency.
+## Rules
+- **No HTTP, no selectors** — only `.feature` + the reviewed `apis.yaml` + `test-data`.
+- **Non-prod default** — a `production` datasource is refused unless `SUNGEN_ALLOW_PROD=1`.
+- **The DB is the oracle** for idempotency/side-effects — HTTP status alone can lie; pair `@api` with `@query`.
+- **`@parallel` + mutating endpoints** — give each scenario **isolated data** (a `{{$uuid}}` email, a `@cases` row, or its own created resource) and **self-clean** (delete what it created); shared inputs race under parallel execution.
+- **No dead data** — every `test-data` key must be bound into a scenario (`{{key}}`, a `@cases` dataset, or an override). Both `sungen audit` and the `generate` lint flag unreferenced keys.

package/src/orchestrator/templates/ai-instructions/claude-skill-gherkin-syntax.md CHANGED Viewed

@@ -213,6 +213,7 @@ Options: `nth` `exact` `scope` `match` `variant` `frame` `contenteditable` `colu
 | `@flow` | Mark feature as E2E flow (cross-screen testing) |
 | `@cases:dataset` | Data-driven: run the scenario once per row of the `dataset` LIST in test-data → one `test()` per row |
 | `@query:name` | Database: run the named query from `database/queries.yaml` (precondition) and bind its rows to `{{name}}`; assert with `expect {{name.count}} …` + path access. Override params `@query:name(p={{v}})`. Repeatable. (Optional Data Driver — see Database verification above) |
+| `@api:name` | API: run the named request from `api/apis.yaml` (precondition) and bind the response to `{{name}}`; assert with `expect {{name.status}} …` + path access (`{{name.body.<path>}}`). Override params `@api:name(p={{v}})`. Repeatable. (Optional API Driver) |
 ### Data-driven scenarios (`@cases`)

package/src/orchestrator/templates/ai-instructions/claude-skill-harness-audit.md CHANGED Viewed

@@ -58,7 +58,7 @@ Use these when repairing GATE/DEPTH findings for the hard viewpoints (cart/detai
   ```
   `see all [X] contain {{v}}` asserts EVERY matching element contains the value → "all displayed products belong to the selected category/brand", not just one.
-> Cross-screen flows (home → detail/cart): if the target screen is a separate screen, prefer a **flow** (`/sungen:add-flow`) so the journey is one test. On a single screen, keep the cross-screen assertion but tag `@manual` with a `# Deferred to a flow` comment.
+> Cross-screen flows (home → detail/cart): **automate the journey as a flow** (`/sungen:add-flow`) — it runs as one test, so it is automatable. Do **not** keep a full `@manual` duplicate of it on the screen (a non-running dead copy that `sungen audit` flags as `MANUAL-AUTOMATABLE` and that does not inflate any metric — deferred business-critical scenarios are reported as `DEPTH-DEFERRED`). The screen keeps its screen-contract; the flow owns the cross-screen depth. `@manual` is for genuine judgment / missing-capability only, tagged `@manual:Mx`.
 ## Repair loop rules
@@ -66,6 +66,7 @@ Use these when repairing GATE/DEPTH findings for the hard viewpoints (cart/detai
 2. **Stop when** `gateStatus == PASS` AND `findings` empty — or budget exhausted.
 3. **Never fake a pass.** A shallow `see [Cart] page` does not satisfy `cart-correctness`. If a gap is genuinely cross-screen or needs capabilities the DSL lacks (e.g. capture an element value to compare elsewhere), **report it as a residual gap / flow item** instead of forcing a green gate.
 4. **EP/data families are OK.** A `duplicates` cluster with `sameDataLikely=false` is an intentional equivalence-partition family (e.g. many invalid-email cases) — keep it; only collapse `sameDataLikely=true` exact duplicates.
+5. **Advisory findings — surface, don't gate.** `MANUAL-REASON-MISMATCH` → fix the scenario's `@manual:Mx` code (so the planner recommends the right driver) during repair. `CAPABILITY-SUGGESTION` → **present it to the user as a next-step option** (e.g. "N @manual could be automated — `sungen capability add api db`?"), **recommend-only — never auto-install**. Neither fails the gate.
 ## Discovery / fallback tree (when input is limited)