npm - qualia-framework - Versions diffs - 6.7.1 → 6.8.0 - Mend

qualia-framework 6.7.1 → 6.8.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (38)

package/CHANGELOG.md +2304 -0
package/FLAGS.md +73 -0
package/SOUL.md +17 -0
package/TROUBLESHOOTING.md +183 -0
package/agents/plan-checker.md +4 -0
package/agents/planner.md +8 -0
package/agents/qa-browser.md +6 -2
package/agents/research-synthesizer.md +4 -0
package/agents/researcher.md +4 -0
package/agents/roadmapper.md +4 -0
package/agents/verifier.md +1 -1
package/agents/visual-evaluator.md +8 -6
package/bin/cli.js +7 -1
package/bin/install.js +69 -7
package/bin/runtime-manifest.js +1 -0
package/bin/security-scan.js +24 -10
package/bin/trust-score.js +34 -0
package/hooks/migration-guard.js +4 -4
package/hooks/pre-deploy-gate.js +14 -4
package/hooks/stop-session-log.js +10 -7
package/package.json +6 -2
package/qualia-design/design-rubric.md +3 -1
package/rules/architecture.md +1 -1
package/rules/grounding.md +3 -1
package/rules/speed.md +2 -2
package/skills/qualia-idk/SKILL.md +3 -3
package/skills/qualia-polish/REFERENCE.md +11 -6
package/skills/qualia-polish/SKILL.md +20 -3
package/skills/qualia-polish/scripts/loop.mjs +24 -6
package/skills/qualia-polish/scripts/playwright-capture.mjs +89 -11
package/skills/qualia-polish/scripts/vibe-tokens.mjs +57 -1
package/skills/qualia-research/SKILL.md +1 -1
package/skills/qualia-road/SKILL.md +6 -0
package/skills/qualia-scope/SKILL.md +2 -2
package/skills/qualia-secure/SKILL.md +5 -5
package/templates/knowledge/index.md +4 -4
package/tests/bin.test.sh +4 -4
package/tests/lib.test.sh +2 -2

package/bin/security-scan.js CHANGED Viewed

@@ -192,7 +192,7 @@ function categoryScore(findings) {
   return { weighted_sum: ws, score: Math.max(1, 5 - Math.floor(ws / 8)), counts };
 }
-function renderMarkdown({ findings, paths, score }) {
+function renderMarkdown({ findings, paths, score, skipped = [] }) {
   const lines = [];
   lines.push(`# Qualia security scan — ${new Date().toISOString()}`);
   lines.push("");
@@ -202,10 +202,16 @@ function renderMarkdown({ findings, paths, score }) {
   lines.push(`**Score:** ${score.score} / 5 (weighted_sum=${score.weighted_sum})`);
   lines.push("");
-  if (findings.length === 0) {
+  if (skipped.length > 0) {
+    lines.push(`⚠️ **${skipped.length} file(s) could not be scanned** — result is NOT clean (score degraded, non-zero exit):`);
+    for (const s of skipped) lines.push(`- \`${s.file}\` — ${s.error}`);
+    lines.push("");
+  }
+  if (findings.length === 0 && skipped.length === 0) {
     lines.push("✅ Clean.");
     lines.push("");
-  } else {
+  } else if (findings.length > 0) {
     findings.sort((a, b) => severityOrder(a.severity) - severityOrder(b.severity));
     for (const f of findings) {
       lines.push(`### [${f.severity}] ${f.name}`);
@@ -346,6 +352,7 @@ function main() {
   const paths = args.paths || defaultPaths();
   const findings = [];
   const scanned = [];
+  const skipped = [];
   for (const p of paths) {
     try {
@@ -356,14 +363,21 @@ function main() {
       const content = fs.readFileSync(p, "utf8");
       scanned.push(p);
       findings.push(...scanContent(p, content));
-    } catch {}
+    } catch (e) { skipped.push({ file: p, error: e.message }); }
   }
   const score = categoryScore(findings);
+  // An unreadable file must not silently pass as clean. Degrade the score
+  // (min 1) and force a non-clean exit when any target could not be scanned.
+  if (skipped.length > 0) score.score = Math.max(1, score.score - 1);
+  // A scan that couldn't read every target must not exit clean — surface at
+  // least HIGH (1) so an unreadable file can't slip through a CI gate as 0.
+  const exitCode = skipped.length > 0 ? Math.max(1, exitCodeFor(findings)) : exitCodeFor(findings);
   if (args.json) {
-    process.stdout.write(JSON.stringify({ findings, scanned, score }, null, 2) + "\n");
-    process.exit(exitCodeFor(findings));
+    process.stdout.write(JSON.stringify({ findings, scanned, skipped, score }, null, 2) + "\n");
+    process.exit(exitCode);
   }
   // --deep emits a prompt pack for the /qualia-secure skill to spawn agents.
@@ -373,17 +387,17 @@ function main() {
     try { fs.mkdirSync(planningDir, { recursive: true }); } catch {}
     const staticPath = path.join(planningDir, "security-scan.md");
     const promptPath = path.join(planningDir, "security-deep-prompt.md");
-    fs.writeFileSync(staticPath, renderMarkdown({ findings, paths: scanned, score }));
+    fs.writeFileSync(staticPath, renderMarkdown({ findings, paths: scanned, score, skipped }));
     fs.writeFileSync(promptPath, renderDeepPromptPack({ findings, scanned, score }));
     console.log(`Wrote ${staticPath}`);
     console.log(`Wrote ${promptPath}`);
     console.log("");
     console.log("Now run `/qualia-secure` in a Claude session — it will read the prompt pack");
     console.log("and spawn the red/blue/auditor agents in parallel.");
-    process.exit(exitCodeFor(findings));
+    process.exit(exitCode);
   }
-  const md = renderMarkdown({ findings, paths: scanned, score });
+  const md = renderMarkdown({ findings, paths: scanned, score, skipped });
   if (args.write) {
     const outDefault = path.join(process.cwd(), ".planning", "security-scan.md");
     const out = args.writePath || outDefault;
@@ -395,7 +409,7 @@ function main() {
     process.stdout.write(md);
   }
-  process.exit(exitCodeFor(findings));
+  process.exit(exitCode);
 }
 module.exports = { main, scanContent, defaultPaths, categoryScore, renderDeepPromptPack, SECRET_PATTERNS, CONFIG_SMELLS };

package/bin/trust-score.js CHANGED Viewed

@@ -4,6 +4,7 @@
 const fs = require("fs");
 const path = require("path");
 const os = require("os");
+const { spawnSync } = require("child_process");
 const pc = require("./plan-contract.js");
 const ledger = require("./state-ledger.js");
 const { binFiles } = require("./runtime-manifest.js");
@@ -16,6 +17,12 @@ const HOMES = [
 const REQUIRED_BIN = binFiles();
+// Spine scripts: the load-bearing executables that, if they fail to even parse,
+// render the install dead. binFiles() does not include these, so probe them
+// explicitly — existence AND Node loadability (`--check`). A syntax-broken
+// cli.js (the C1 regression) must force overall status off PASS.
+const SPINE_SCRIPTS = ["cli.js", "install.js", "state.js"];
 const REQUIRED_HOOKS = [
   "session-start.js", "auto-update.js", "branch-guard.js", "pre-push.js",
   "pre-deploy-gate.js", "migration-guard.js", "git-guardrails.js",
@@ -54,10 +61,27 @@ function inspectInstall(homes) {
     return { status: "fail", score: 0, issues: ["no Qualia install config found"], targets: [] };
   }
   const issues = [];
+  let spineBroken = false;
   for (const home of homes) {
     for (const f of REQUIRED_BIN) {
       if (!exists(path.join(home.dir, "bin", f))) issues.push(`${home.name}: missing bin/${f}`);
     }
+    // Spine probe: Node loadability of spine scripts that ARE installed.
+    // cli.js / install.js run via npx / the installer and are not part of the
+    // runtime manifest, so their ABSENCE is not a failure (state.js absence is
+    // already reported via REQUIRED_BIN). But any spine script that IS present
+    // must pass `--check` — a syntax-broken installed spine forces status off
+    // PASS (this is the check that catches a dead cli.js).
+    for (const f of SPINE_SCRIPTS) {
+      const fp = path.join(home.dir, "bin", f);
+      if (!exists(fp)) continue;
+      const r = spawnSync(process.execPath, ["--check", fp], { encoding: "utf8" });
+      if (r.status !== 0) {
+        const detail = (r.stderr || r.error?.message || "").split("\n")[0].trim();
+        issues.push(`${home.name}: spine bin/${f} fails --check${detail ? ` (${detail})` : ""}`);
+        spineBroken = true;
+      }
+    }
     if (home.name === "Claude") {
       if (!exists(path.join(home.dir, "CLAUDE.md"))) issues.push("Claude: missing CLAUDE.md");
       if (!exists(path.join(home.dir, "settings.json"))) issues.push("Claude: missing settings.json");
@@ -67,6 +91,16 @@ function inspectInstall(homes) {
       if (!exists(path.join(home.dir, "config.toml"))) issues.push("Codex: missing config.toml");
     }
   }
+  // A broken spine is a hard failure: the install cannot run. Force fail status
+  // (drives overall status to FAIL) and zero the score regardless of other issues.
+  if (spineBroken) {
+    return {
+      status: "fail",
+      score: 0,
+      issues,
+      targets: homes.map((h) => h.name),
+    };
+  }
   return {
     status: issues.length ? "degraded" : "pass",
     score: issues.length ? Math.max(6, 20 - issues.length * 2) : 20,

package/hooks/migration-guard.js CHANGED Viewed

@@ -161,12 +161,12 @@ if (realCreateTables.length > 0 && !/ENABLE\s+ROW\s+LEVEL\s+SECURITY/i.test(scan
 }
 if (errors.length > 0) {
-  console.log("⬢ Migration guard — dangerous patterns found:");
+  console.error("⬢ Migration guard — dangerous patterns found:");
   for (const e of errors) {
-    console.log(`  ✗ ${e}`);
+    console.error(`  ✗ ${e}`);
   }
-  console.log("");
-  console.log("Fix these before proceeding. If intentional, ask Fawzi to approve.");
+  console.error("");
+  console.error("Fix these before proceeding. If intentional, ask Fawzi to approve.");
   _trace("migration-guard", "block", { errors });
   process.exit(2);
 }

package/hooks/pre-deploy-gate.js CHANGED Viewed

@@ -73,7 +73,7 @@ function runGate(label, cmd, args, { required = true } = {}) {
   const r = spawnSync(cmd, args, {
     stdio: ["ignore", "pipe", "pipe"],
     encoding: "utf8",
-    timeout: 180000,
+    timeout: 120000,
     shell: process.platform === "win32",
   });
   if (r.status === 0) {
@@ -170,7 +170,10 @@ function enforceShipPolicy() {
   // If this is not a Qualia-managed project, keep the legacy behavior: run the
   // quality/security gates but do not invent state.
-  if (!state || !status) return;
+  if (!state || !status) {
+    console.error("⬢ pre-deploy-gate: no ship-state (brownfield) — skipping ship-policy gate, quality gates still run");
+    return;
+  }
   const shippable = status === "polished" || (status === "verified" && verification === "pass");
   if (!shippable && !force) {
@@ -281,9 +284,16 @@ if (fs.existsSync("tsconfig.json")) {
 }
 // Lint (with QUALIA_SKIP_LINT=1 escape — for the documented "lint blocks
-// ship" friction when the lint failures are pre-existing or auto-fixer-broken)
+// ship" friction when the lint failures are pre-existing or auto-fixer-broken).
+// The skip is OWNER-only, mirroring the QUALIA_SHIP_FORCE role-gate above.
 if (hasScript("lint")) {
-  if (process.env.QUALIA_SKIP_LINT === "1") {
+  const skipLint = process.env.QUALIA_SKIP_LINT === "1";
+  const lintRole = String(readConfig().role || "").toUpperCase();
+  if (skipLint && lintRole !== "OWNER") {
+    const lintState = readState();
+    blockDeploy("QUALIA_SKIP_LINT is OWNER-only.", (lintState && lintState.next_command) || "/qualia");
+  }
+  if (skipLint) {
     console.log("  ⚠ Lint skipped (QUALIA_SKIP_LINT=1)");
     _trace("pre-deploy-gate", "skip-lint", { reason: "QUALIA_SKIP_LINT=1" });
   } else {

package/hooks/stop-session-log.js CHANGED Viewed

@@ -91,13 +91,16 @@ try {
   // dedupe marker, so it's a cheap no-op on every turn except the one right
   // after a ship. Wrapped + unref'd so it never blocks or breaks the session.
   try {
-    const { spawn } = require("child_process");
-    const child = spawn(
-      process.execPath,
-      [path.join(__dirname, "..", "bin", "auto-report.js")],
-      { cwd: process.cwd(), detached: true, stdio: "ignore" },
-    );
-    child.unref();
+    const autoReportPath = path.join(__dirname, "..", "bin", "auto-report.js");
+    if (fs.existsSync(autoReportPath)) {
+      const { spawn } = require("child_process");
+      const child = spawn(
+        process.execPath,
+        [autoReportPath],
+        { cwd: process.cwd(), detached: true, stdio: "ignore" },
+      );
+      child.unref();
+    }
   } catch {}
   // ── Skip if too soon since last write ────────────────────

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "qualia-framework",
-  "version": "6.7.1",
+  "version": "6.8.0",
   "description": "Claude Code and Codex workflow framework by Qualia Solutions. Plan, build, verify, ship.",
   "bin": {
     "qualia-framework": "./bin/cli.js"
@@ -54,7 +54,11 @@
     "docs/onboarding.html",
     "CLAUDE.md",
     "AGENTS.md",
-    "guide.md"
+    "guide.md",
+    "SOUL.md",
+    "FLAGS.md",
+    "TROUBLESHOOTING.md",
+    "CHANGELOG.md"
   ],
   "engines": {
     "node": ">=18"

package/qualia-design/design-rubric.md CHANGED Viewed

@@ -33,7 +33,7 @@ A whole-app `/qualia-polish` scores all 9 dimensions across multiple representat
 | Score | Criteria |
 |---|---|
-| 1 | Inter / Roboto / Arial / system-ui / Space Grotesk as primary. Or single weight throughout. |
+| 1 | Banned font as primary — Inter / Roboto / Arial / Helvetica / system-ui / Space Grotesk / Montserrat / Poppins / Lato / Open Sans. Or single weight throughout. |
 | 2 | Project font loaded but only one weight, no scale, no letter-spacing variation. |
 | 3 | Project font with 2-3 weights, basic scale (h1/h2/body), readable. |
 | 4 | Distinctive display + refined body pair. Letter-spacing varied semantically (tight headlines, open labels). Tabular numerals on numeric data. |
@@ -41,6 +41,8 @@ A whole-app `/qualia-polish` scores all 9 dimensions across multiple representat
 Evidence format: `font-family: "<name>" used at line N, scale 14/16/24/40 visible, weights 400/500/700`
+The banned-font list above is the machine-enforced counterpart of `bin/slop-detect.mjs` (the static anti-pattern scanner) and `agents/visual-evaluator.md` (the vision evaluator). All three are kept in lockstep — Inter, Roboto, Arial, Helvetica, system-ui, Space Grotesk, Montserrat, Poppins, Lato, Open Sans.
 ### 2. Color cohesion
 | Score | Criteria |

package/rules/architecture.md CHANGED Viewed

@@ -119,7 +119,7 @@ Read this file (auto-load via skill or `@rules/architecture.md`) when:
 - Planning a new module or feature with multiple components.
 - The user requests `/qualia-optimize --deepen` or `--alignment`.
-- A verifier is scoring "Container depth & nesting" (per `rules/design-rubric.md` dimension 8).
+- A verifier is scoring "Container depth & nesting" (per `qualia-design/design-rubric.md` dimension 8).
 - An ADR is being drafted for an architectural fork.
 Do **not** auto-load this on quick fixes, copy edits, single-component touch-ups — that wastes instruction budget. Use judgment.

package/rules/grounding.md CHANGED Viewed

@@ -102,7 +102,9 @@ Every skill that spawns an agent must order the prompt from most-stable to most-
 - Per-task content goes LAST. Mixing task-specific files into `<phase_context>` breaks cache on every spawn within the same wave.
 - Reference files via `@path` when the harness auto-expands, OR inline the content — but pick one and stick with it per section (switching styles breaks prefix match).
-## Design Quality Rubric (any dimension < 3 = mandatory fix before commit)
+## Design Quality Rubric (pre-commit SUBSET — any dimension < 3 = mandatory fix before commit)
+> This 6-dimension table (Typography, Color, Spacing, States, Responsiveness, Accessibility) is the fast pre-commit gate every agent runs inline. It is a **deliberate SUBSET** of the authoritative 9-dimension rubric at `@qualia-design/design-rubric.md` (Typography, Color cohesion, Spatial rhythm, Layout originality, Shadow & depth, Motion intent, Microcopy specificity, Container depth & nesting, Visual system & graphics). For verifier scoring and `/qualia-polish --critique`, the 9-dimension rubric is authoritative — read it, don't score from this subset.
 | Dimension       | 1 (Fail)                                | 3 (Acceptable)                           | 5 (Excellent)                              |
 |-----------------|-----------------------------------------|------------------------------------------|--------------------------------------------|

package/rules/speed.md CHANGED Viewed

@@ -26,14 +26,14 @@ MCP servers impose a **token tax**: their tool definitions consume context-windo
 - The MCP returns structured JSON that would be expensive to parse from CLI text output.
 - The MCP enforces governance (RLS-aware queries, scoped DB credentials) that the CLI doesn't.
-If a `/skill-name` exists that wraps a CLI, prefer the skill — it's been hardened. Canonical example: `/supabase` skill replaces 32 supabase MCP tools with `supabase` CLI calls and saves substantial token budget.
+If a `/skill-name` exists that wraps a CLI, prefer the skill — it's been hardened. Canonical example: drive Supabase through `npx supabase` (migrations, type generation, local dev, SQL) instead of the Supabase MCP — the CLI hits the same API at zero token cost and replaces the bulk of the MCP tool surface (see `infrastructure.md`).
 ### MCP tier-list (when each is justified)
 | Server | Always-on? | Justification |
 |---|---|---|
 | `claude-in-chrome` | On-demand | Browser automation has no CLI equivalent; use for QA flows only |
-| `supabase` MCP | **Off** in favor of `/supabase` skill | CLI covers 95% of operations; MCP only if you need branch management interactively |
+| `supabase` MCP | **Off** in favor of `npx supabase` CLI | CLI covers 95% of operations; MCP only if you need branch management interactively |
 | `context7` | On-demand | Library docs at runtime — no CLI alternative for Context7 itself |
 | `notebooklm-mcp` | On-demand | NotebookLM has no CLI; only loaded when researching against existing notebooks |
 | `firecrawl-mcp` | On-demand | Web scraping; only loaded when feature requires it |

package/skills/qualia-idk/SKILL.md CHANGED Viewed

@@ -207,13 +207,13 @@ With all three reports + `<user_confusion>` + `<session_context>` in hand, produ
 |---|---|
 | Plan says built but code has stubs | `/qualia-plan {N} --gaps` → `/qualia-build` → `/qualia-verify` |
 | Verify FAILed and no postmortem ran | `/qualia-postmortem` → `/qualia-plan {N} --gaps` → `/qualia-build` |
-| Stale `.continue-here.md`, ongoing context | `/qualia-resume` → `/qualia` |
+| Stale `.continue-here.md`, ongoing context | `/qualia` (restores from `.continue-here.md` / STATE.md) |
 | Brownfield drift (plan and code diverged hard) | `/qualia-map` → `/qualia-plan {N} --gaps` |
 | Phase context missing (no `/qualia-scope` ran) | `/qualia-scope {N}` → `/qualia-plan {N}` |
 | Specific error, scope clear | `/qualia-fix '<symptom>'` |
 | Performance feels off, no profile | `/qualia-fix --perf '<route>'` or `/qualia-optimize --perf` |
 | Design feels off | `/qualia-polish --critique` then `/qualia-polish` |
-| User is overwhelmed | `/qualia-pause` (save handoff), come back later |
+| User is overwhelmed | `/qualia-handoff` (save handoff), come back later |
 | Truly nothing actionable found | Ask one specific question; don't invent a sequence |
 Pick the sequence that fits the actual evidence. Substitute real `{N}` from the Plan-view scan.
@@ -240,6 +240,6 @@ node ${QUALIA_BIN}/qualia-ui.js end "DIAGNOSED" "{first command in the sequence,
 - User knows what they're doing and just wants the next command → `/qualia`
 - User has a specific error message they want fixed → `/qualia-fix '<symptom>'`
 - User wants to review code quality → `/qualia-review`
-- User wants to pause and come back → `/qualia-pause`
+- User wants to pause and come back → `/qualia-handoff`
 `/qualia-idk` is specifically for **"I'm not sure what I'm even looking at"** situations. Route to sharper tools when the question is sharper.

package/skills/qualia-polish/REFERENCE.md CHANGED Viewed

@@ -32,6 +32,8 @@ The capture script (`scripts/playwright-capture.mjs`) auto-selects in this order
 3. `~/.cache/ms-playwright/chromium-{version}/chrome-{linux64,linux,mac,win}/chrome` — if Playwright was ever installed for browsers but the package isn't import-resolvable
 4. `which google-chrome` / `chromium` / `chromium-browser` / `chrome` — system browser fallback
+All four backends capture **full-page** (top-to-bottom, including below-fold content), not just the first viewport-height — the evaluator scores card grids, CTAs, and footers, so the capture must contain them. Playwright uses `fullPage: true`; the binary-direct backends omit any fixed `--window-size` height clamp so the headless render extends to the full document height.
 For backends 3 and 4 (binary-direct), the script uses `--headless=new --screenshot --virtual-time-budget`. Less precise than Playwright's `networkidle` waiting but works without any npm dependency.
 Setup hints if all four fail:
@@ -70,6 +72,7 @@ Role: @${QUALIA_AGENTS}/visual-evaluator.md
 </product>
 <screenshots>
+One full-page PNG per viewport (captured top-to-bottom, includes below-fold content):
 - mobile (375px):  /tmp/qpl-{ts}/iter-{N}/mobile-375.png
 - tablet (768px):  /tmp/qpl-{ts}/iter-{N}/tablet-768.png
 - desktop (1440px): /tmp/qpl-{ts}/iter-{N}/desktop-1440.png
@@ -150,6 +153,8 @@ the {dim} dimension at {score}. Your single task: fix that one dimension.
 ## Iteration log entry (what `loop.mjs record` writes to state.json.iterations[])
+`tokens_used` is a **deterministic fixed charge** (`PER_ITER_TOKENS = 14500`) added by `loop.mjs` per iteration — NOT a number reported by the evaluator. The evaluator's JSON no longer carries a `tokens_used` field; trusting a model-guessed count to drive the budget kill-switch was unsound, so the loop charges the constant below for every iteration regardless of what any agent claims.
 ```json
 {
   "iteration": 1,
@@ -189,20 +194,20 @@ When the kill trigger fires, the verdict becomes `killed_regression` and `state.
 | 6 | ~90K  | default |
 | 8 | ~120K | hard cap; pass `--budget 150000` to allow |
-Per-iteration cost (rough):
-- 3 screenshot reads ≈ 9K
+Per-iteration cost is charged as a **fixed constant** — `loop.mjs` adds `PER_ITER_TOKENS = 14500` per iteration to `state.tokens_used` and the kill-switch trips off that deterministic sum, never off an agent's self-estimate. The breakdown below is the rationale for the constant, not a per-run measurement:
+- 3 full-page screenshot reads ≈ 9K (taller PNGs than viewport-height crops, but still 3 reads)
 - rubric + brief inlined ≈ 2K (cached after iter 1)
 - previous-iteration delta ≈ 0.5K
 - 3 fix-builder spawns × (file read + edit + commit-fix call) ≈ 3K
-- **per-iteration ≈ 14.5K**
+- **per-iteration constant = 14.5K** (deterministic — same charge every iteration)
 ## Self-test scenarios (mapping to spec)
 | # | Fixture | Expected | Verifier |
 |---|---|---|---|
-| 1 | `fixtures/clean.html` | SUCCESS in 1-2 iterations, all dims ≥ 4 | run capture, run evaluator inline, assert pass |
-| 2 | `fixtures/broken.html` | SUCCESS in 4-6 iters; identifies banned font + gradient + 3-card grid + side-stripe + generic CTA | each fix-builder commits a `qpl-N:` change; final eval all dims ≥ 3 |
-| 3 | Kill-switch | KILL at iter ≤ 4 with `LOOP_REGRESSION_DETECTED` | call `loop.mjs record` 3× with the same fingerprint; assert exit 3 + correct verdict |
+| 1 | `fixtures/clean.html` | Each viewport PNG is full-page (height > viewport-height when the fixture scrolls); SUCCESS in 1-2 iterations, all dims ≥ 4 | run capture, assert PNG is full-page, run evaluator inline, assert pass |
+| 2 | `fixtures/broken.html` | SUCCESS in 4-6 iters; identifies banned font + gradient + 3-card grid + side-stripe + generic CTA, INCLUDING below-fold offenders only visible in a full-page shot | each fix-builder commits a `qpl-N:` change; final eval all dims ≥ 3 |
+| 3 | Kill-switch | KILL at iter ≤ 4 with `LOOP_REGRESSION_DETECTED`; `state.tokens_used` increments by exactly `PER_ITER_TOKENS` (14500) per recorded iteration | call `loop.mjs record` 3× with the same fingerprint; assert exit 3 + correct verdict + deterministic token charge |
 The pilot-results doc at `docs/playwright-loop-pilot-results.md` records the actual outcome from `bash scripts/_self-tests.sh` (Scenario 3 is exercised by a deterministic unit-style invocation; Scenarios 1+2 require a real vision pass and are run by Claude when the loop ships).

package/skills/qualia-polish/SKILL.md CHANGED Viewed

@@ -32,7 +32,7 @@ The first argument selects the scope. Stage selection follows from scope.
 | `/qualia-polish --redesign` | **Redesign** | ~30m | all + Stage 1 mandatory + 2 vision iterations |
 | `/qualia-polish --critique` | **Critique** | read-only | 0, 4, 5 (no edits) |
 | `/qualia-polish --quick` | **Quick** | ~1m | 0, 2, 7 (gates only, no vision loop) |
-| `/qualia-polish --vibe` | **Aesthetic pivot** | ~3m | token-only direction swap |
+| `/qualia-polish --vibe` | **Aesthetic pivot** | ~3m | token-only direction swap, then 6, 7 (mandatory verify) |
 | `/qualia-polish --loop {url}` | **Loop** | ~5-15m | autonomous see/fix/verify, max 8 iterations |
 Other flags: `--register=brand|product` overrides register inference. Vibe-specific flags: `--variants N`, `--extract URL|image`, `--sync`, `--write`. Loop-specific flags: `--brief PATH`, `--max N`, `--viewports 375,768,1440`, `--ref PATH`, `--budget 100000`.
@@ -73,6 +73,21 @@ node ${QUALIA_SKILLS}/qualia-polish/scripts/vibe-extract.mjs --source https://ex
 Default flow proposes one opinionated direction per `rules/one-opinion.md`. `--variants` is opt-in only. `--extract` stages a reference screenshot for the visual evaluator to reverse-engineer into DESIGN.md tokens; the user must review low-confidence extracted values before application. `--sync --write` patches DESIGN.md to reflect tokens already present in code.
+**Mandatory post-vibe verify (the token swap does NOT terminate the run).** After applying the new tokens, a `--vibe` run MUST NOT stop at the swap — it routes back through **Stage 6 — Verify**. Before claiming done, run:
+```bash
+# 1. TypeScript still compiles (a token rename can break a typed theme map)
+npx tsc --noEmit 2>&1 | tail -10
+# 2. Anti-pattern scan on the files the swap touched (new tokens can reintroduce slop)
+node bin/slop-detect.mjs {changed_files}
+[ $? -eq 0 ] || { echo "vibe failed — slop-detect found critical issues in the new tokens"; exit 1; }
+```
+3. **Scoped vision check (best-effort, only if a dev server is reachable).** Capture one screenshot per viewport via `scripts/playwright-capture.mjs` and spawn a single `agents/visual-evaluator.md` pass scored ONLY on the dimensions a token swap can move: **color, typography, shadow, motion**. (Layout/container/microcopy/graphics are out of scope for `--vibe` — do not score or fix them here.) If no dev server and no resolvable browser backend, skip this step and note it in the report; never block the vibe on missing optional tooling. Any in-scope dimension scoring < 3 is a regression in the new direction — re-tune the offending tokens and re-verify, do not ship.
+Only after Stage 6 passes does the `--vibe` run reach Stage 7 (commit & state).
 ## Setup gates (non-optional, every scope)
 Before any work — design or otherwise — pass these gates. Skipping them produces generic output that ignores the project.
@@ -163,6 +178,8 @@ Each agent receives:
 For Component scope: do the work in main context. Read, fix, verify.
+**Optional best-effort vision check (Component scope).** If a dev server is reachable (`http://localhost:3000` or a port detected via `lsof`) AND `scripts/playwright-capture.mjs` can resolve a browser backend, capture ONE screenshot at a single viewport (desktop 1440 is sufficient for an isolated component) and spawn one `agents/visual-evaluator.md` pass scored on component-relevant dimensions ONLY — Typography, Color, Shadow, Motion, Microcopy (skip Layout/Container/Graphics per the rubric's scope guard). This is opt-in by environment: if there is no dev server or no resolvable backend, **omit the vision check without error and record that it was unavailable** — never block a component touch-up on missing optional tooling.
 **Apply fixes scoped by what's missing:**
 | If the file scores low on… | Apply fix from… |
@@ -216,7 +233,7 @@ If a11y < 90 OR axe critical/serious violations: **fix programmatically** (th
 ### Stage 4 — Vision loop (Redesign scope only — max 2 iterations)
-Use `skills/qualia-polish/scripts/playwright-capture.mjs` (Playwright backend, with Chrome-binary fallback). Capture screenshots at 3 viewports: 375 / 768 / 1440 — these match what `agents/visual-evaluator.md` expects, and what the `--loop` mode captures. Single browser (Chromium) is fine in 2026 — cross-browser CSS rendering differences are vanishingly rare.
+Use `skills/qualia-polish/scripts/playwright-capture.mjs` (Playwright backend, with Chrome-binary fallback). Capture one **full-page** screenshot at each of 3 viewports: 375 / 768 / 1440 — these match what `agents/visual-evaluator.md` expects, and what the `--loop` mode captures. The `height` below sets the viewport window, not the screenshot height — each PNG runs top-to-bottom so the evaluator scores below-fold sections too. Single browser (Chromium) is fine in 2026 — cross-browser CSS rendering differences are vanishingly rare.
 ```
 viewports: [
@@ -325,6 +342,6 @@ node ${QUALIA_BIN}/qualia-ui.js end "POLISHED" "{next command — depends on con
 | `bin/slop-detect.mjs not found` | Framework not installed in project | Run `npx qualia install` or pull script from framework repo |
 | `PRODUCT.md missing` | Project predates the PRODUCT.md substrate | Run setup (ask 5 questions, generate). For Component scope, can proceed with nudge. |
 | `Lighthouse not installed` | Optional tool | Skip Stage 3 numeric gates, note in report. Don't fail. |
-| `webapp-testing skill not present` | Optional Anthropic skill | Skip Stage 4 vision loop on Redesign scope. Note in report. Recommend installing. |
+| `playwright-capture.mjs cannot resolve a browser backend` | None of the four backends resolve: no `playwright`/`playwright-core` package, no cached Chromium under `~/.cache/ms-playwright`, no system Chrome/Chromium on PATH | Skip Stage 4 vision loop (and any screenshot-dependent verify). Note in report. Install a backend: `npm i -D playwright && npx playwright install chromium` (or put `google-chrome` on PATH). |
 | Vision loop oscillates between iterations | Rubric not anchored properly | Verify rubric prompt instructs "default to 3, only 1-2 = fix". Hard cap at 2 iterations. |
 | User says "you missed X" after polish completes | Scope was too narrow | Re-run with wider scope (`/qualia-polish` whole-app). Don't argue scope. |

package/skills/qualia-polish/scripts/loop.mjs CHANGED Viewed

@@ -14,12 +14,13 @@
  *
  * Eval JSON contract (what the vision evaluator returns):
  *   {
- *     "iteration": 1,
+ *     "iteration": 1,            // optional; if present must equal state.iteration+1 (monotonicity assertion)
  *     "viewport_results": [{ viewport, scores, top_issues }],
  *     "aggregate_scores": { typography, color, spatial, layout, shadow, motion, microcopy, container, graphics },
- *     "top_issues": [{ dim, severity, description, likely_file, fix }],
- *     "tokens_used": 14500
+ *     "top_issues": [{ dim, severity, description, likely_file, fix }]
  *   }
+ *   NOTE: any ev.tokens_used is IGNORED — the budget charges a fixed
+ *   PER_ITER_TOKENS per iteration (deterministic kill-switch).
  *
  * State JSON shape (written by this script, read by Claude):
  *   {
@@ -35,6 +36,12 @@ import { spawnSync } from "node:child_process";
 const DIMS = ["typography", "color", "spatial", "layout", "shadow", "motion", "microcopy", "container", "graphics"];
+// Deterministic per-iteration token charge (evaluator + 3 builders ≈ 14500,
+// per REFERENCE.md:197). The budget kill-switch must NOT trust ev.tokens_used —
+// that is a model-guessed number and an under-report would defeat the budget
+// guard. Charge a fixed, known cost per iteration instead.
+const PER_ITER_TOKENS = 14500;
 // ── helpers ──────────────────────────────────────────────────────────────
 function loadState(p) {
   if (!existsSync(p)) { console.error(`state file not found: ${p}`); exit(2); }
@@ -98,7 +105,17 @@ function cmdRecord() {
   const state = loadState(statePath);
   const ev = JSON.parse(readFileSync(evalPath, "utf8"));
-  state.iteration = (ev.iteration ?? state.iteration + 1);
+  // Monotonicity assertion: each record() advances exactly one iteration. If the
+  // evaluator reports an iteration that disagrees with the script's own counter,
+  // the loop is desynced — fail loudly instead of trusting the eval's number,
+  // which would corrupt regression fingerprinting (state.iteration is the key
+  // used at :108-134) and budget accounting.
+  const expected = state.iteration + 1;
+  if (ev.iteration != null && ev.iteration !== expected) {
+    console.error(`loop desync: eval.iteration=${ev.iteration} expected=${expected}`);
+    exit(2);
+  }
+  state.iteration = expected;
   const scores = ev.aggregate_scores || {};
   const failingDims = DIMS.filter((d) => (scores[d] ?? 3) < 3);
   const total = DIMS.reduce((a, d) => a + (scores[d] ?? 3), 0);
@@ -133,7 +150,8 @@ function cmdRecord() {
     return its.length >= 3 && its.every((n, i, arr) => i === 0 || n === arr[i - 1] + 1);
   });
-  state.tokens_used += parseInt(ev.tokens_used || 0, 10) || 0;
+  // Charge the fixed per-iteration cost — do NOT trust ev.tokens_used.
+  state.tokens_used += PER_ITER_TOKENS;
   state.iterations.push({
     iteration: state.iteration,
@@ -142,7 +160,7 @@ function cmdRecord() {
     pass,
     failing_dims: failingDims,
     top_issues: issues,
-    tokens_used: ev.tokens_used || 0,
+    tokens_used: PER_ITER_TOKENS,
     timestamp: new Date().toISOString(),
   });