qualia-framework 6.7.0 → 6.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/CHANGELOG.md +2304 -0
  2. package/FLAGS.md +73 -0
  3. package/SOUL.md +17 -0
  4. package/TROUBLESHOOTING.md +183 -0
  5. package/agents/plan-checker.md +4 -0
  6. package/agents/planner.md +8 -0
  7. package/agents/qa-browser.md +6 -2
  8. package/agents/research-synthesizer.md +4 -0
  9. package/agents/researcher.md +4 -0
  10. package/agents/roadmapper.md +4 -0
  11. package/agents/verifier.md +1 -1
  12. package/agents/visual-evaluator.md +8 -6
  13. package/bin/cli.js +7 -1
  14. package/bin/install.js +69 -7
  15. package/bin/runtime-manifest.js +1 -0
  16. package/bin/security-scan.js +24 -10
  17. package/bin/trust-score.js +34 -0
  18. package/docs/onboarding.html +1 -1
  19. package/hooks/migration-guard.js +4 -4
  20. package/hooks/pre-deploy-gate.js +14 -4
  21. package/hooks/stop-session-log.js +10 -7
  22. package/package.json +6 -2
  23. package/qualia-design/design-rubric.md +3 -1
  24. package/rules/architecture.md +1 -1
  25. package/rules/grounding.md +3 -1
  26. package/rules/speed.md +2 -2
  27. package/skills/qualia-idk/SKILL.md +3 -3
  28. package/skills/qualia-polish/REFERENCE.md +11 -6
  29. package/skills/qualia-polish/SKILL.md +20 -3
  30. package/skills/qualia-polish/scripts/loop.mjs +24 -6
  31. package/skills/qualia-polish/scripts/playwright-capture.mjs +89 -11
  32. package/skills/qualia-polish/scripts/vibe-tokens.mjs +57 -1
  33. package/skills/qualia-research/SKILL.md +1 -1
  34. package/skills/qualia-road/SKILL.md +6 -0
  35. package/skills/qualia-scope/SKILL.md +2 -2
  36. package/skills/qualia-secure/SKILL.md +5 -5
  37. package/templates/help.html +1 -1
  38. package/templates/knowledge/index.md +4 -4
  39. package/tests/bin.test.sh +4 -4
  40. package/tests/lib.test.sh +2 -2
@@ -192,7 +192,7 @@ function categoryScore(findings) {
192
192
  return { weighted_sum: ws, score: Math.max(1, 5 - Math.floor(ws / 8)), counts };
193
193
  }
194
194
 
195
- function renderMarkdown({ findings, paths, score }) {
195
+ function renderMarkdown({ findings, paths, score, skipped = [] }) {
196
196
  const lines = [];
197
197
  lines.push(`# Qualia security scan — ${new Date().toISOString()}`);
198
198
  lines.push("");
@@ -202,10 +202,16 @@ function renderMarkdown({ findings, paths, score }) {
202
202
  lines.push(`**Score:** ${score.score} / 5 (weighted_sum=${score.weighted_sum})`);
203
203
  lines.push("");
204
204
 
205
- if (findings.length === 0) {
205
+ if (skipped.length > 0) {
206
+ lines.push(`⚠️ **${skipped.length} file(s) could not be scanned** — result is NOT clean (score degraded, non-zero exit):`);
207
+ for (const s of skipped) lines.push(`- \`${s.file}\` — ${s.error}`);
208
+ lines.push("");
209
+ }
210
+
211
+ if (findings.length === 0 && skipped.length === 0) {
206
212
  lines.push("✅ Clean.");
207
213
  lines.push("");
208
- } else {
214
+ } else if (findings.length > 0) {
209
215
  findings.sort((a, b) => severityOrder(a.severity) - severityOrder(b.severity));
210
216
  for (const f of findings) {
211
217
  lines.push(`### [${f.severity}] ${f.name}`);
@@ -346,6 +352,7 @@ function main() {
346
352
  const paths = args.paths || defaultPaths();
347
353
  const findings = [];
348
354
  const scanned = [];
355
+ const skipped = [];
349
356
 
350
357
  for (const p of paths) {
351
358
  try {
@@ -356,14 +363,21 @@ function main() {
356
363
  const content = fs.readFileSync(p, "utf8");
357
364
  scanned.push(p);
358
365
  findings.push(...scanContent(p, content));
359
- } catch {}
366
+ } catch (e) { skipped.push({ file: p, error: e.message }); }
360
367
  }
361
368
 
362
369
  const score = categoryScore(findings);
370
+ // An unreadable file must not silently pass as clean. Degrade the score
371
+ // (min 1) and force a non-clean exit when any target could not be scanned.
372
+ if (skipped.length > 0) score.score = Math.max(1, score.score - 1);
373
+
374
+ // A scan that couldn't read every target must not exit clean — surface at
375
+ // least HIGH (1) so an unreadable file can't slip through a CI gate as 0.
376
+ const exitCode = skipped.length > 0 ? Math.max(1, exitCodeFor(findings)) : exitCodeFor(findings);
363
377
 
364
378
  if (args.json) {
365
- process.stdout.write(JSON.stringify({ findings, scanned, score }, null, 2) + "\n");
366
- process.exit(exitCodeFor(findings));
379
+ process.stdout.write(JSON.stringify({ findings, scanned, skipped, score }, null, 2) + "\n");
380
+ process.exit(exitCode);
367
381
  }
368
382
 
369
383
  // --deep emits a prompt pack for the /qualia-secure skill to spawn agents.
@@ -373,17 +387,17 @@ function main() {
373
387
  try { fs.mkdirSync(planningDir, { recursive: true }); } catch {}
374
388
  const staticPath = path.join(planningDir, "security-scan.md");
375
389
  const promptPath = path.join(planningDir, "security-deep-prompt.md");
376
- fs.writeFileSync(staticPath, renderMarkdown({ findings, paths: scanned, score }));
390
+ fs.writeFileSync(staticPath, renderMarkdown({ findings, paths: scanned, score, skipped }));
377
391
  fs.writeFileSync(promptPath, renderDeepPromptPack({ findings, scanned, score }));
378
392
  console.log(`Wrote ${staticPath}`);
379
393
  console.log(`Wrote ${promptPath}`);
380
394
  console.log("");
381
395
  console.log("Now run `/qualia-secure` in a Claude session — it will read the prompt pack");
382
396
  console.log("and spawn the red/blue/auditor agents in parallel.");
383
- process.exit(exitCodeFor(findings));
397
+ process.exit(exitCode);
384
398
  }
385
399
 
386
- const md = renderMarkdown({ findings, paths: scanned, score });
400
+ const md = renderMarkdown({ findings, paths: scanned, score, skipped });
387
401
  if (args.write) {
388
402
  const outDefault = path.join(process.cwd(), ".planning", "security-scan.md");
389
403
  const out = args.writePath || outDefault;
@@ -395,7 +409,7 @@ function main() {
395
409
  process.stdout.write(md);
396
410
  }
397
411
 
398
- process.exit(exitCodeFor(findings));
412
+ process.exit(exitCode);
399
413
  }
400
414
 
401
415
  module.exports = { main, scanContent, defaultPaths, categoryScore, renderDeepPromptPack, SECRET_PATTERNS, CONFIG_SMELLS };
@@ -4,6 +4,7 @@
4
4
  const fs = require("fs");
5
5
  const path = require("path");
6
6
  const os = require("os");
7
+ const { spawnSync } = require("child_process");
7
8
  const pc = require("./plan-contract.js");
8
9
  const ledger = require("./state-ledger.js");
9
10
  const { binFiles } = require("./runtime-manifest.js");
@@ -16,6 +17,12 @@ const HOMES = [
16
17
 
17
18
  const REQUIRED_BIN = binFiles();
18
19
 
20
+ // Spine scripts: the load-bearing executables that, if they fail to even parse,
21
+ // render the install dead. binFiles() does not include these, so probe them
22
+ // explicitly — existence AND Node loadability (`--check`). A syntax-broken
23
+ // cli.js (the C1 regression) must force overall status off PASS.
24
+ const SPINE_SCRIPTS = ["cli.js", "install.js", "state.js"];
25
+
19
26
  const REQUIRED_HOOKS = [
20
27
  "session-start.js", "auto-update.js", "branch-guard.js", "pre-push.js",
21
28
  "pre-deploy-gate.js", "migration-guard.js", "git-guardrails.js",
@@ -54,10 +61,27 @@ function inspectInstall(homes) {
54
61
  return { status: "fail", score: 0, issues: ["no Qualia install config found"], targets: [] };
55
62
  }
56
63
  const issues = [];
64
+ let spineBroken = false;
57
65
  for (const home of homes) {
58
66
  for (const f of REQUIRED_BIN) {
59
67
  if (!exists(path.join(home.dir, "bin", f))) issues.push(`${home.name}: missing bin/${f}`);
60
68
  }
69
+ // Spine probe: Node loadability of spine scripts that ARE installed.
70
+ // cli.js / install.js run via npx / the installer and are not part of the
71
+ // runtime manifest, so their ABSENCE is not a failure (state.js absence is
72
+ // already reported via REQUIRED_BIN). But any spine script that IS present
73
+ // must pass `--check` — a syntax-broken installed spine forces status off
74
+ // PASS (this is the check that catches a dead cli.js).
75
+ for (const f of SPINE_SCRIPTS) {
76
+ const fp = path.join(home.dir, "bin", f);
77
+ if (!exists(fp)) continue;
78
+ const r = spawnSync(process.execPath, ["--check", fp], { encoding: "utf8" });
79
+ if (r.status !== 0) {
80
+ const detail = (r.stderr || r.error?.message || "").split("\n")[0].trim();
81
+ issues.push(`${home.name}: spine bin/${f} fails --check${detail ? ` (${detail})` : ""}`);
82
+ spineBroken = true;
83
+ }
84
+ }
61
85
  if (home.name === "Claude") {
62
86
  if (!exists(path.join(home.dir, "CLAUDE.md"))) issues.push("Claude: missing CLAUDE.md");
63
87
  if (!exists(path.join(home.dir, "settings.json"))) issues.push("Claude: missing settings.json");
@@ -67,6 +91,16 @@ function inspectInstall(homes) {
67
91
  if (!exists(path.join(home.dir, "config.toml"))) issues.push("Codex: missing config.toml");
68
92
  }
69
93
  }
94
+ // A broken spine is a hard failure: the install cannot run. Force fail status
95
+ // (drives overall status to FAIL) and zero the score regardless of other issues.
96
+ if (spineBroken) {
97
+ return {
98
+ status: "fail",
99
+ score: 0,
100
+ issues,
101
+ targets: homes.map((h) => h.name),
102
+ };
103
+ }
70
104
  return {
71
105
  status: issues.length ? "degraded" : "pass",
72
106
  score: issues.length ? Math.max(6, 20 - issues.length * 2) : 20,
@@ -546,7 +546,7 @@
546
546
  <div class="kit-group">
547
547
  <h4>Lock decisions before code</h4>
548
548
  <dl>
549
- <dt>/qualia-discuss</dt><dd>Without args: project-level non-technical discovery (mandatory at kickoff). With <code>N</code>: phase-level technical alignment with locked decisions and ADRs.</dd>
549
+ <dt>/qualia-scope</dt><dd>Without args: project-level non-technical discovery (mandatory at kickoff). With <code>N</code>: phase-level archetype-aware grill with locked decisions and ADRs, gated on Definition-of-Done.</dd>
550
550
  <dt>/qualia-research</dt><dd>Deep-research a niche library or domain before planning a phase.</dd>
551
551
  </dl>
552
552
  </div>
@@ -161,12 +161,12 @@ if (realCreateTables.length > 0 && !/ENABLE\s+ROW\s+LEVEL\s+SECURITY/i.test(scan
161
161
  }
162
162
 
163
163
  if (errors.length > 0) {
164
- console.log("⬢ Migration guard — dangerous patterns found:");
164
+ console.error("⬢ Migration guard — dangerous patterns found:");
165
165
  for (const e of errors) {
166
- console.log(` ✗ ${e}`);
166
+ console.error(` ✗ ${e}`);
167
167
  }
168
- console.log("");
169
- console.log("Fix these before proceeding. If intentional, ask Fawzi to approve.");
168
+ console.error("");
169
+ console.error("Fix these before proceeding. If intentional, ask Fawzi to approve.");
170
170
  _trace("migration-guard", "block", { errors });
171
171
  process.exit(2);
172
172
  }
@@ -73,7 +73,7 @@ function runGate(label, cmd, args, { required = true } = {}) {
73
73
  const r = spawnSync(cmd, args, {
74
74
  stdio: ["ignore", "pipe", "pipe"],
75
75
  encoding: "utf8",
76
- timeout: 180000,
76
+ timeout: 120000,
77
77
  shell: process.platform === "win32",
78
78
  });
79
79
  if (r.status === 0) {
@@ -170,7 +170,10 @@ function enforceShipPolicy() {
170
170
 
171
171
  // If this is not a Qualia-managed project, keep the legacy behavior: run the
172
172
  // quality/security gates but do not invent state.
173
- if (!state || !status) return;
173
+ if (!state || !status) {
174
+ console.error("⬢ pre-deploy-gate: no ship-state (brownfield) — skipping ship-policy gate, quality gates still run");
175
+ return;
176
+ }
174
177
 
175
178
  const shippable = status === "polished" || (status === "verified" && verification === "pass");
176
179
  if (!shippable && !force) {
@@ -281,9 +284,16 @@ if (fs.existsSync("tsconfig.json")) {
281
284
  }
282
285
 
283
286
  // Lint (with QUALIA_SKIP_LINT=1 escape — for the documented "lint blocks
284
- // ship" friction when the lint failures are pre-existing or auto-fixer-broken)
287
+ // ship" friction when the lint failures are pre-existing or auto-fixer-broken).
288
+ // The skip is OWNER-only, mirroring the QUALIA_SHIP_FORCE role-gate above.
285
289
  if (hasScript("lint")) {
286
- if (process.env.QUALIA_SKIP_LINT === "1") {
290
+ const skipLint = process.env.QUALIA_SKIP_LINT === "1";
291
+ const lintRole = String(readConfig().role || "").toUpperCase();
292
+ if (skipLint && lintRole !== "OWNER") {
293
+ const lintState = readState();
294
+ blockDeploy("QUALIA_SKIP_LINT is OWNER-only.", (lintState && lintState.next_command) || "/qualia");
295
+ }
296
+ if (skipLint) {
287
297
  console.log(" ⚠ Lint skipped (QUALIA_SKIP_LINT=1)");
288
298
  _trace("pre-deploy-gate", "skip-lint", { reason: "QUALIA_SKIP_LINT=1" });
289
299
  } else {
@@ -91,13 +91,16 @@ try {
91
91
  // dedupe marker, so it's a cheap no-op on every turn except the one right
92
92
  // after a ship. Wrapped + unref'd so it never blocks or breaks the session.
93
93
  try {
94
- const { spawn } = require("child_process");
95
- const child = spawn(
96
- process.execPath,
97
- [path.join(__dirname, "..", "bin", "auto-report.js")],
98
- { cwd: process.cwd(), detached: true, stdio: "ignore" },
99
- );
100
- child.unref();
94
+ const autoReportPath = path.join(__dirname, "..", "bin", "auto-report.js");
95
+ if (fs.existsSync(autoReportPath)) {
96
+ const { spawn } = require("child_process");
97
+ const child = spawn(
98
+ process.execPath,
99
+ [autoReportPath],
100
+ { cwd: process.cwd(), detached: true, stdio: "ignore" },
101
+ );
102
+ child.unref();
103
+ }
101
104
  } catch {}
102
105
 
103
106
  // ── Skip if too soon since last write ────────────────────
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "qualia-framework",
3
- "version": "6.7.0",
3
+ "version": "6.8.0",
4
4
  "description": "Claude Code and Codex workflow framework by Qualia Solutions. Plan, build, verify, ship.",
5
5
  "bin": {
6
6
  "qualia-framework": "./bin/cli.js"
@@ -54,7 +54,11 @@
54
54
  "docs/onboarding.html",
55
55
  "CLAUDE.md",
56
56
  "AGENTS.md",
57
- "guide.md"
57
+ "guide.md",
58
+ "SOUL.md",
59
+ "FLAGS.md",
60
+ "TROUBLESHOOTING.md",
61
+ "CHANGELOG.md"
58
62
  ],
59
63
  "engines": {
60
64
  "node": ">=18"
@@ -33,7 +33,7 @@ A whole-app `/qualia-polish` scores all 9 dimensions across multiple representat
33
33
 
34
34
  | Score | Criteria |
35
35
  |---|---|
36
- | 1 | Inter / Roboto / Arial / system-ui / Space Grotesk as primary. Or single weight throughout. |
36
+ | 1 | Banned font as primary — Inter / Roboto / Arial / Helvetica / system-ui / Space Grotesk / Montserrat / Poppins / Lato / Open Sans. Or single weight throughout. |
37
37
  | 2 | Project font loaded but only one weight, no scale, no letter-spacing variation. |
38
38
  | 3 | Project font with 2-3 weights, basic scale (h1/h2/body), readable. |
39
39
  | 4 | Distinctive display + refined body pair. Letter-spacing varied semantically (tight headlines, open labels). Tabular numerals on numeric data. |
@@ -41,6 +41,8 @@ A whole-app `/qualia-polish` scores all 9 dimensions across multiple representat
41
41
 
42
42
  Evidence format: `font-family: "<name>" used at line N, scale 14/16/24/40 visible, weights 400/500/700`
43
43
 
44
+ The banned-font list above is the rubric-side counterpart of the lists enforced by `bin/slop-detect.mjs` (the static anti-pattern scanner) and `agents/visual-evaluator.md` (the vision evaluator). All three are kept in lockstep — Inter, Roboto, Arial, Helvetica, system-ui, Space Grotesk, Montserrat, Poppins, Lato, Open Sans.
45
+
44
46
  ### 2. Color cohesion
45
47
 
46
48
  | Score | Criteria |
@@ -119,7 +119,7 @@ Read this file (auto-load via skill or `@rules/architecture.md`) when:
119
119
 
120
120
  - Planning a new module or feature with multiple components.
121
121
  - The user requests `/qualia-optimize --deepen` or `--alignment`.
122
- - A verifier is scoring "Container depth & nesting" (per `rules/design-rubric.md` dimension 8).
122
+ - A verifier is scoring "Container depth & nesting" (per `qualia-design/design-rubric.md` dimension 8).
123
123
  - An ADR is being drafted for an architectural fork.
124
124
 
125
125
  Do **not** auto-load this on quick fixes, copy edits, single-component touch-ups — that wastes instruction budget. Use judgment.
@@ -102,7 +102,9 @@ Every skill that spawns an agent must order the prompt from most-stable to most-
102
102
  - Per-task content goes LAST. Mixing task-specific files into `<phase_context>` breaks cache on every spawn within the same wave.
103
103
  - Reference files via `@path` when the harness auto-expands, OR inline the content — but pick one and stick with it per section (switching styles breaks prefix match).
104
104
 
105
- ## Design Quality Rubric (any dimension < 3 = mandatory fix before commit)
105
+ ## Design Quality Rubric (pre-commit SUBSET — any dimension < 3 = mandatory fix before commit)
106
+
107
+ > This 6-dimension table (Typography, Color, Spacing, States, Responsiveness, Accessibility) is the fast pre-commit gate every agent runs inline. It is a **deliberate SUBSET** of the authoritative 9-dimension rubric at `@qualia-design/design-rubric.md` (Typography, Color cohesion, Spatial rhythm, Layout originality, Shadow & depth, Motion intent, Microcopy specificity, Container depth & nesting, Visual system & graphics). For verifier scoring and `/qualia-polish --critique`, the 9-dimension rubric is authoritative — read it, don't score from this subset.
106
108
 
107
109
  | Dimension | 1 (Fail) | 3 (Acceptable) | 5 (Excellent) |
108
110
  |-----------------|-----------------------------------------|------------------------------------------|--------------------------------------------|
package/rules/speed.md CHANGED
@@ -26,14 +26,14 @@ MCP servers impose a **token tax**: their tool definitions consume context-windo
26
26
  - The MCP returns structured JSON that would be expensive to parse from CLI text output.
27
27
  - The MCP enforces governance (RLS-aware queries, scoped DB credentials) that the CLI doesn't.
28
28
 
29
- If a `/skill-name` exists that wraps a CLI, prefer the skill — it's been hardened. Canonical example: `/supabase` skill replaces 32 supabase MCP tools with `supabase` CLI calls and saves substantial token budget.
29
+ If a `/skill-name` exists that wraps a CLI, prefer the skill — it's been hardened. Canonical example: drive Supabase through `npx supabase` (migrations, type generation, local dev, SQL) instead of the Supabase MCP — the CLI hits the same API at zero token cost and replaces the bulk of the MCP tool surface (see `infrastructure.md`).
30
30
 
31
31
  ### MCP tier-list (when each is justified)
32
32
 
33
33
  | Server | Always-on? | Justification |
34
34
  |---|---|---|
35
35
  | `claude-in-chrome` | On-demand | Browser automation has no CLI equivalent; use for QA flows only |
36
- | `supabase` MCP | **Off** in favor of `/supabase` skill | CLI covers 95% of operations; MCP only if you need branch management interactively |
36
+ | `supabase` MCP | **Off** in favor of `npx supabase` CLI | CLI covers 95% of operations; MCP only if you need branch management interactively |
37
37
  | `context7` | On-demand | Library docs at runtime — no CLI alternative for Context7 itself |
38
38
  | `notebooklm-mcp` | On-demand | NotebookLM has no CLI; only loaded when researching against existing notebooks |
39
39
  | `firecrawl-mcp` | On-demand | Web scraping; only loaded when feature requires it |
@@ -207,13 +207,13 @@ With all three reports + `<user_confusion>` + `<session_context>` in hand, produ
207
207
  |---|---|
208
208
  | Plan says built but code has stubs | `/qualia-plan {N} --gaps` → `/qualia-build` → `/qualia-verify` |
209
209
  | Verify FAILed and no postmortem ran | `/qualia-postmortem` → `/qualia-plan {N} --gaps` → `/qualia-build` |
210
- | Stale `.continue-here.md`, ongoing context | `/qualia-resume` `/qualia` |
210
+ | Stale `.continue-here.md`, ongoing context | `/qualia` (restores from `.continue-here.md` / STATE.md) |
211
211
  | Brownfield drift (plan and code diverged hard) | `/qualia-map` → `/qualia-plan {N} --gaps` |
212
212
  | Phase context missing (no `/qualia-scope` ran) | `/qualia-scope {N}` → `/qualia-plan {N}` |
213
213
  | Specific error, scope clear | `/qualia-fix '<symptom>'` |
214
214
  | Performance feels off, no profile | `/qualia-fix --perf '<route>'` or `/qualia-optimize --perf` |
215
215
  | Design feels off | `/qualia-polish --critique` then `/qualia-polish` |
216
- | User is overwhelmed | `/qualia-pause` (save handoff), come back later |
216
+ | User is overwhelmed | `/qualia-handoff` (save handoff), come back later |
217
217
  | Truly nothing actionable found | Ask one specific question; don't invent a sequence |
218
218
 
219
219
  Pick the sequence that fits the actual evidence. Substitute real `{N}` from the Plan-view scan.
@@ -240,6 +240,6 @@ node ${QUALIA_BIN}/qualia-ui.js end "DIAGNOSED" "{first command in the sequence,
240
240
  - User knows what they're doing and just wants the next command → `/qualia`
241
241
  - User has a specific error message they want fixed → `/qualia-fix '<symptom>'`
242
242
  - User wants to review code quality → `/qualia-review`
243
- - User wants to pause and come back → `/qualia-pause`
243
+ - User wants to pause and come back → `/qualia-handoff`
244
244
 
245
245
  `/qualia-idk` is specifically for **"I'm not sure what I'm even looking at"** situations. Route to sharper tools when the question is sharper.
@@ -32,6 +32,8 @@ The capture script (`scripts/playwright-capture.mjs`) auto-selects in this order
32
32
  3. `~/.cache/ms-playwright/chromium-{version}/chrome-{linux64,linux,mac,win}/chrome` — if Playwright was ever installed for browsers but the package isn't import-resolvable
33
33
  4. `which google-chrome` / `chromium` / `chromium-browser` / `chrome` — system browser fallback
34
34
 
35
+ All four backends capture **full-page** (top-to-bottom, including below-fold content), not just the first viewport-height — the evaluator scores card grids, CTAs, and footers, so the capture must contain them. Playwright uses `fullPage: true`; the binary-direct backends omit any fixed `--window-size` height clamp so the headless render extends to the full document height.
36
+
35
37
  For backends 3 and 4 (binary-direct), the script uses `--headless=new --screenshot --virtual-time-budget`. Less precise than Playwright's `networkidle` waiting but works without any npm dependency.
36
38
 
37
39
  Setup hints if all four fail:
@@ -70,6 +72,7 @@ Role: @${QUALIA_AGENTS}/visual-evaluator.md
70
72
  </product>
71
73
 
72
74
  <screenshots>
75
+ One full-page PNG per viewport (captured top-to-bottom, includes below-fold content):
73
76
  - mobile (375px): /tmp/qpl-{ts}/iter-{N}/mobile-375.png
74
77
  - tablet (768px): /tmp/qpl-{ts}/iter-{N}/tablet-768.png
75
78
  - desktop (1440px): /tmp/qpl-{ts}/iter-{N}/desktop-1440.png
@@ -150,6 +153,8 @@ the {dim} dimension at {score}. Your single task: fix that one dimension.
150
153
 
151
154
  ## Iteration log entry (what `loop.mjs record` writes to state.json.iterations[])
152
155
 
156
+ `tokens_used` is a **deterministic fixed charge** (`PER_ITER_TOKENS = 14500`) added by `loop.mjs` per iteration — NOT a number reported by the evaluator. The evaluator's JSON no longer carries a `tokens_used` field; trusting a model-guessed count to drive the budget kill-switch was unsound, so the loop charges the constant below for every iteration regardless of what any agent claims.
157
+
153
158
  ```json
154
159
  {
155
160
  "iteration": 1,
@@ -189,20 +194,20 @@ When the kill trigger fires, the verdict becomes `killed_regression` and `state.
189
194
  | 6 | ~90K | default |
190
195
  | 8 | ~120K | hard cap; pass `--budget 150000` to allow |
191
196
 
192
- Per-iteration cost (rough):
193
- - 3 screenshot reads ≈ 9K
197
+ Per-iteration cost is charged as a **fixed constant** — `loop.mjs` adds `PER_ITER_TOKENS = 14500` per iteration to `state.tokens_used` and the kill-switch trips off that deterministic sum, never off an agent's self-estimate. The breakdown below is the rationale for the constant, not a per-run measurement:
198
+ - 3 full-page screenshot reads ≈ 9K (taller PNGs than viewport-height crops, but still 3 reads)
194
199
  - rubric + brief inlined ≈ 2K (cached after iter 1)
195
200
  - previous-iteration delta ≈ 0.5K
196
201
  - 3 fix-builder spawns × (file read + edit + commit-fix call) ≈ 3K
197
- - **per-iteration 14.5K**
202
+ - **per-iteration constant = 14.5K** (deterministic — same charge every iteration)
198
203
 
199
204
  ## Self-test scenarios (mapping to spec)
200
205
 
201
206
  | # | Fixture | Expected | Verifier |
202
207
  |---|---|---|---|
203
- | 1 | `fixtures/clean.html` | SUCCESS in 1-2 iterations, all dims ≥ 4 | run capture, run evaluator inline, assert pass |
204
- | 2 | `fixtures/broken.html` | SUCCESS in 4-6 iters; identifies banned font + gradient + 3-card grid + side-stripe + generic CTA | each fix-builder commits a `qpl-N:` change; final eval all dims ≥ 3 |
205
- | 3 | Kill-switch | KILL at iter ≤ 4 with `LOOP_REGRESSION_DETECTED` | call `loop.mjs record` 3× with the same fingerprint; assert exit 3 + correct verdict |
208
+ | 1 | `fixtures/clean.html` | Each viewport PNG is full-page (height > viewport-height when the fixture scrolls); SUCCESS in 1-2 iterations, all dims ≥ 4 | run capture, assert PNG is full-page, run evaluator inline, assert pass |
209
+ | 2 | `fixtures/broken.html` | SUCCESS in 4-6 iters; identifies banned font + gradient + 3-card grid + side-stripe + generic CTA, INCLUDING below-fold offenders only visible in a full-page shot | each fix-builder commits a `qpl-N:` change; final eval all dims ≥ 3 |
210
+ | 3 | Kill-switch | KILL at iter ≤ 4 with `LOOP_REGRESSION_DETECTED`; `state.tokens_used` increments by exactly `PER_ITER_TOKENS` (14500) per recorded iteration | call `loop.mjs record` 3× with the same fingerprint; assert exit 3 + correct verdict + deterministic token charge |
206
211
 
207
212
  The pilot-results doc at `docs/playwright-loop-pilot-results.md` records the actual outcome from `bash scripts/_self-tests.sh` (Scenario 3 is exercised by a deterministic unit-style invocation; Scenarios 1+2 require a real vision pass and are run by Claude when the loop ships).
208
213
 
@@ -32,7 +32,7 @@ The first argument selects the scope. Stage selection follows from scope.
32
32
  | `/qualia-polish --redesign` | **Redesign** | ~30m | all + Stage 1 mandatory + 2 vision iterations |
33
33
  | `/qualia-polish --critique` | **Critique** | read-only | 0, 4, 5 (no edits) |
34
34
  | `/qualia-polish --quick` | **Quick** | ~1m | 0, 2, 7 (gates only, no vision loop) |
35
- | `/qualia-polish --vibe` | **Aesthetic pivot** | ~3m | token-only direction swap |
35
+ | `/qualia-polish --vibe` | **Aesthetic pivot** | ~3m | token-only direction swap, then 6, 7 (mandatory verify) |
36
36
  | `/qualia-polish --loop {url}` | **Loop** | ~5-15m | autonomous see/fix/verify, max 8 iterations |
37
37
 
38
38
  Other flags: `--register=brand|product` overrides register inference. Vibe-specific flags: `--variants N`, `--extract URL|image`, `--sync`, `--write`. Loop-specific flags: `--brief PATH`, `--max N`, `--viewports 375,768,1440`, `--ref PATH`, `--budget 100000`.
@@ -73,6 +73,21 @@ node ${QUALIA_SKILLS}/qualia-polish/scripts/vibe-extract.mjs --source https://ex
73
73
 
74
74
  Default flow proposes one opinionated direction per `rules/one-opinion.md`. `--variants` is opt-in only. `--extract` stages a reference screenshot for the visual evaluator to reverse-engineer into DESIGN.md tokens; the user must review low-confidence extracted values before application. `--sync --write` patches DESIGN.md to reflect tokens already present in code.
75
75
 
76
+ **Mandatory post-vibe verify (the token swap does NOT terminate the run).** After applying the new tokens, a `--vibe` run MUST NOT stop at the swap — it routes back through **Stage 6 — Verify**. Before claiming done, run:
77
+
78
+ ```bash
79
+ # 1. TypeScript still compiles (a token rename can break a typed theme map)
80
+ npx tsc --noEmit 2>&1 | tail -10
81
+
82
+ # 2. Anti-pattern scan on the files the swap touched (new tokens can reintroduce slop)
83
+ node bin/slop-detect.mjs {changed_files}
84
+ [ $? -eq 0 ] || { echo "vibe failed — slop-detect found critical issues in the new tokens"; exit 1; }
85
+ ```
86
+
87
+ 3. **Scoped vision check (best-effort, only if a dev server is reachable).** Capture one screenshot per viewport via `scripts/playwright-capture.mjs` and spawn a single `agents/visual-evaluator.md` pass scored ONLY on the dimensions a token swap can move: **color, typography, shadow, motion**. (Layout/container/microcopy/graphics are out of scope for `--vibe` — do not score or fix them here.) If no dev server and no resolvable browser backend, skip this step and note it in the report; never block the vibe on missing optional tooling. Any in-scope dimension scoring < 3 is a regression in the new direction — re-tune the offending tokens and re-verify, do not ship.
88
+
89
+ Only after Stage 6 passes does the `--vibe` run reach Stage 7 (commit & state).
90
+
76
91
  ## Setup gates (non-optional, every scope)
77
92
 
78
93
  Before any work — design or otherwise — pass these gates. Skipping them produces generic output that ignores the project.
@@ -163,6 +178,8 @@ Each agent receives:
163
178
 
164
179
  For Component scope: do the work in main context. Read, fix, verify.
165
180
 
181
+ **Optional best-effort vision check (Component scope).** If a dev server is reachable (`http://localhost:3000` or a port detected via `lsof`) AND `scripts/playwright-capture.mjs` can resolve a browser backend, capture ONE screenshot at a single viewport (desktop 1440 is sufficient for an isolated component) and spawn one `agents/visual-evaluator.md` pass scored on component-relevant dimensions ONLY — Typography, Color, Shadow, Motion, Microcopy (skip Layout/Container/Graphics per the rubric's scope guard). This is opt-in by environment: if there is no dev server or no resolvable backend, **omit the vision check without error and record that it was unavailable** — never block a component touch-up on missing optional tooling.
182
+
166
183
  **Apply fixes scoped by what's missing:**
167
184
 
168
185
  | If the file scores low on… | Apply fix from… |
@@ -216,7 +233,7 @@ If a11y < 90 OR axe critical/serious violations: **fix programmatically** (th
216
233
 
217
234
  ### Stage 4 — Vision loop (Redesign scope only — max 2 iterations)
218
235
 
219
- Use `skills/qualia-polish/scripts/playwright-capture.mjs` (Playwright backend, with Chrome-binary fallback). Capture screenshots at 3 viewports: 375 / 768 / 1440 — these match what `agents/visual-evaluator.md` expects, and what the `--loop` mode captures. Single browser (Chromium) is fine in 2026 — cross-browser CSS rendering differences are vanishingly rare.
236
+ Use `skills/qualia-polish/scripts/playwright-capture.mjs` (Playwright backend, with Chrome-binary fallback). Capture one **full-page** screenshot at each of 3 viewports: 375 / 768 / 1440 — these match what `agents/visual-evaluator.md` expects, and what the `--loop` mode captures. The `height` below sets the viewport window, not the screenshot height — each PNG runs top-to-bottom so the evaluator scores below-fold sections too. Single browser (Chromium) is fine in 2026 — cross-browser CSS rendering differences are vanishingly rare.
220
237
 
221
238
  ```
222
239
  viewports: [
@@ -325,6 +342,6 @@ node ${QUALIA_BIN}/qualia-ui.js end "POLISHED" "{next command — depends on con
325
342
  | `bin/slop-detect.mjs not found` | Framework not installed in project | Run `npx qualia install` or pull script from framework repo |
326
343
  | `PRODUCT.md missing` | Project predates the PRODUCT.md substrate | Run setup (ask 5 questions, generate). For Component scope, can proceed with nudge. |
327
344
  | `Lighthouse not installed` | Optional tool | Skip Stage 3 numeric gates, note in report. Don't fail. |
328
- | `webapp-testing skill not present` | Optional Anthropic skill | Skip Stage 4 vision loop on Redesign scope. Note in report. Recommend installing. |
345
+ | `playwright-capture.mjs cannot resolve a browser backend` | None of the four backends resolve: no `playwright`/`playwright-core` package, no cached Chromium under `~/.cache/ms-playwright`, no system Chrome/Chromium on PATH | Skip Stage 4 vision loop (and any screenshot-dependent verify). Note in report. Install a backend: `npm i -D playwright && npx playwright install chromium` (or put `google-chrome` on PATH). |
329
346
  | Vision loop oscillates between iterations | Rubric not anchored properly | Verify rubric prompt instructs "default to 3, only 1-2 = fix". Hard cap at 2 iterations. |
330
347
  | User says "you missed X" after polish completes | Scope was too narrow | Re-run with wider scope (`/qualia-polish` whole-app). Don't argue scope. |
@@ -14,12 +14,13 @@
14
14
  *
15
15
  * Eval JSON contract (what the vision evaluator returns):
16
16
  * {
17
- * "iteration": 1,
17
+ * "iteration": 1, // optional; if present must equal state.iteration+1 (monotonicity assertion)
18
18
  * "viewport_results": [{ viewport, scores, top_issues }],
19
19
  * "aggregate_scores": { typography, color, spatial, layout, shadow, motion, microcopy, container, graphics },
20
- * "top_issues": [{ dim, severity, description, likely_file, fix }],
21
- * "tokens_used": 14500
20
+ * "top_issues": [{ dim, severity, description, likely_file, fix }]
22
21
  * }
22
+ * NOTE: any ev.tokens_used is IGNORED — the budget charges a fixed
23
+ * PER_ITER_TOKENS per iteration (deterministic kill-switch).
23
24
  *
24
25
  * State JSON shape (written by this script, read by Claude):
25
26
  * {
@@ -35,6 +36,12 @@ import { spawnSync } from "node:child_process";
35
36
 
36
37
  const DIMS = ["typography", "color", "spatial", "layout", "shadow", "motion", "microcopy", "container", "graphics"];
37
38
 
39
+ // Deterministic per-iteration token charge (evaluator + 3 builders ≈ 14500,
40
+ // per REFERENCE.md:197). The budget kill-switch must NOT trust ev.tokens_used —
41
+ // that is a model-guessed number and an under-report would defeat the budget
42
+ // guard. Charge a fixed, known cost per iteration instead.
43
+ const PER_ITER_TOKENS = 14500;
44
+
38
45
  // ── helpers ──────────────────────────────────────────────────────────────
39
46
  function loadState(p) {
40
47
  if (!existsSync(p)) { console.error(`state file not found: ${p}`); exit(2); }
@@ -98,7 +105,17 @@ function cmdRecord() {
98
105
  const state = loadState(statePath);
99
106
  const ev = JSON.parse(readFileSync(evalPath, "utf8"));
100
107
 
101
- state.iteration = (ev.iteration ?? state.iteration + 1);
108
+ // Monotonicity assertion: each record() advances exactly one iteration. If the
109
+ // evaluator reports an iteration that disagrees with the script's own counter,
110
+ // the loop is desynced — fail loudly instead of trusting the eval's number,
111
+ // which would corrupt regression fingerprinting (state.iteration is the key
112
+ // used by the regression check later in cmdRecord) and budget accounting.
113
+ const expected = state.iteration + 1;
114
+ if (ev.iteration != null && ev.iteration !== expected) {
115
+ console.error(`loop desync: eval.iteration=${ev.iteration} expected=${expected}`);
116
+ exit(2);
117
+ }
118
+ state.iteration = expected;
102
119
  const scores = ev.aggregate_scores || {};
103
120
  const failingDims = DIMS.filter((d) => (scores[d] ?? 3) < 3);
104
121
  const total = DIMS.reduce((a, d) => a + (scores[d] ?? 3), 0);
@@ -133,7 +150,8 @@ function cmdRecord() {
133
150
  return its.length >= 3 && its.every((n, i, arr) => i === 0 || n === arr[i - 1] + 1);
134
151
  });
135
152
 
136
- state.tokens_used += parseInt(ev.tokens_used || 0, 10) || 0;
153
+ // Charge the fixed per-iteration cost — do NOT trust ev.tokens_used.
154
+ state.tokens_used += PER_ITER_TOKENS;
137
155
 
138
156
  state.iterations.push({
139
157
  iteration: state.iteration,
@@ -142,7 +160,7 @@ function cmdRecord() {
142
160
  pass,
143
161
  failing_dims: failingDims,
144
162
  top_issues: issues,
145
- tokens_used: ev.tokens_used || 0,
163
+ tokens_used: PER_ITER_TOKENS,
146
164
  timestamp: new Date().toISOString(),
147
165
  });
148
166