qualia-framework 4.3.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/CLAUDE.md +13 -1
  2. package/README.md +16 -13
  3. package/agents/builder.md +12 -20
  4. package/agents/plan-checker.md +18 -0
  5. package/agents/planner.md +9 -0
  6. package/agents/verifier.md +62 -0
  7. package/bin/agent-runs.js +233 -0
  8. package/bin/cli.js +225 -21
  9. package/bin/install.js +25 -5
  10. package/bin/plan-contract.js +220 -0
  11. package/bin/slop-detect.mjs +357 -0
  12. package/bin/state.js +199 -10
  13. package/docs/agent-runs.md +273 -0
  14. package/docs/erp-contract.md +5 -0
  15. package/docs/plan-contract.md +321 -0
  16. package/hooks/auto-update.js +3 -7
  17. package/hooks/pre-compact.js +22 -11
  18. package/hooks/pre-deploy-gate.js +16 -2
  19. package/hooks/pre-push.js +22 -2
  20. package/hooks/stop-session-log.js +1 -1
  21. package/package.json +8 -2
  22. package/rules/design-brand.md +110 -0
  23. package/rules/design-laws.md +144 -0
  24. package/rules/design-product.md +110 -0
  25. package/rules/design-rubric.md +153 -0
  26. package/skills/qualia-build/SKILL.md +5 -5
  27. package/skills/qualia-flush/SKILL.md +1 -1
  28. package/skills/qualia-new/SKILL.md +40 -3
  29. package/skills/qualia-polish/SKILL.md +180 -136
  30. package/skills/qualia-quick/SKILL.md +1 -1
  31. package/skills/qualia-report/SKILL.md +25 -5
  32. package/skills/qualia-ship/SKILL.md +12 -10
  33. package/skills/zoho-workflow/SKILL.md +64 -0
  34. package/templates/DESIGN.md +229 -435
  35. package/templates/PRODUCT.md +95 -0
  36. package/templates/help.html +13 -7
  37. package/tests/bin.test.sh +6 -3
  38. package/tests/hooks.test.sh +9 -20
  39. package/tests/lib.test.sh +217 -0
  40. package/tests/runner.js +96 -75
  41. package/tests/state.test.sh +4 -3
  42. package/skills/qualia-design/SKILL.md +0 -169
@@ -0,0 +1,273 @@
1
+ # Agent Runs Telemetry
2
+
3
+ Append-only JSONL ledger of every subagent spawn, recorded per project. Substrate for `qualia-framework agents`, postmortem analysis, and ERP enrichment.
4
+
5
+ Status: **draft, v1.** Pressure-test the shape against real spawns before locking.
6
+
7
+ ## Why this exists
8
+
9
+ Today, `traces.jsonl` records hook-level events only. There is zero per-agent telemetry: no record of which builder ran for how long on which task, which verifier failed and why, which researcher hit a rate limit. The data needed to answer "which task failed twice and required a postmortem" doesn't exist.
10
+
11
+ This file specifies a per-spawn record that lives next to the project (not in `~/.claude/`) so it travels with the repo, is committed alongside other planning artifacts, and stays attributable to a specific phase.
12
+
13
+ ## File layout
14
+
15
+ ```
16
+ .planning/
17
+ agent-runs.jsonl # all-time, append-only
18
+ agent-runs/
19
+ 2026-04-28.jsonl # daily rotation (optional, see below)
20
+ ```
21
+
22
+ **Rotation:** start with a single file. If `agent-runs.jsonl` exceeds 5MB, rotate to a dated subfile under `agent-runs/`. Cheap, no dependency.
23
+
24
+ **Privacy:** records contain file paths, task ids, durations, token counts, error strings — never command output beyond the last 500 chars of stderr kept in `failure_detail`, never file contents, never user prompts. The schema below is the upper bound of what we capture. Setting the `QUALIA_TELEMETRY=off` env var disables writes.
25
+
26
+ ## Schema (v1)
27
+
28
+ OpenTelemetry GenAI semantic conventions where they fit; framework-specific fields where they don't.
29
+
30
+ ```ts
31
+ interface AgentRunRecord {
32
+ // Identity
33
+ schema_version: 1;
34
+ run_id: string; // ULID — sortable, monotonic
35
+ parent_run_id?: string; // ONLY for true nesting (an agent spawned this one); omitted otherwise
36
+ skill_invocation_id: string; // groups runs from one skill call (sequential or parallel siblings)
37
+ session_id?: string; // Claude Code session id when reachable; per-process UUID fallback
38
+
39
+ // What ran
40
+ agent_type: AgentType;
41
+ agent_name?: string; // for custom agents (e.g. "frontend-agent")
42
+ model: string; // "claude-opus-4-7", "claude-sonnet-4-6", etc.
43
+ effort?: "low" | "medium" | "high" | "max";
44
+
45
+ // Where in the road
46
+ project: string; // tracking.json.project
47
+ phase?: number; // current phase if applicable
48
+ milestone?: number;
49
+ task_id?: string; // contract task id ("T1", "T2") for builders
50
+ wave?: number;
51
+ retry_of?: string; // run_id of the prior failed attempt this one is retrying
52
+
53
+ // Lifecycle
54
+ status: AgentStatus;
55
+ started_at: string; // ISO 8601 UTC
56
+ finished_at: string; // ISO 8601 UTC
57
+ duration_ms: number;
58
+
59
+ // Cost (OTel-aligned, optional — only if obtainable from spawn shape)
60
+ input_tokens?: number; // gen_ai.usage.input_tokens
61
+ output_tokens?: number; // gen_ai.usage.output_tokens
62
+ cache_read_tokens?: number; // gen_ai.usage.cache_read.input_tokens
63
+ cache_creation_tokens?: number; // gen_ai.usage.cache_creation.input_tokens
64
+
65
+ // Activity
66
+ tool_calls_count?: number;
67
+ files_changed?: string[]; // repo-relative, deduped
68
+ commit_sha?: string; // if the run produced a commit
69
+
70
+ // Outcome detail
71
+ // status = did the agent process complete cleanly (success/failure/timeout/...)
72
+ // verification_result = did the code under test pass (only on agent_type="verifier")
73
+ // a verifier with status="success" + verification_result="fail" = the verifier ran fine and the code failed.
74
+ // a verifier with status="failure" = the verifier itself errored (timeout, infra, etc.)
75
+ verifier_score?: number; // 1-5 if agent_type=verifier
76
+ verification_result?: "pass" | "fail" | "partial";
77
+ failure_reason?: string; // short, machine-classifiable; see "Failure taxonomy" below
78
+ failure_detail?: string; // last 500 chars of stderr/error — keep the tail (newest content), drop the head
79
+
80
+ // Self-link — only set on failure
81
+ log_file?: string; // .planning/agent-runs/<run_id>.log if status != success
82
+ }
83
+
84
+ type AgentType =
85
+ | "planner"
86
+ | "plan-checker"
87
+ | "builder"
88
+ | "verifier"
89
+ | "qa-browser"
90
+ | "researcher"
91
+ | "research-synthesizer"
92
+ | "roadmapper"
93
+ | "team-orchestrator"
94
+ | "custom"; // user-defined agents
95
+
96
+ type AgentStatus =
97
+ | "success" // completed, no failure_reason
98
+ | "partial" // completed but flagged issues (e.g. builder PARTIAL)
99
+ | "blocked" // builder hit a precondition gate (e.g. file lock)
100
+ | "failure" // explicit failure (verifier fail, builder error)
101
+ | "timeout" // exceeded budget
102
+ | "interrupted"; // user cancelled / parent killed
103
+ ```
104
+
105
+ ### Failure taxonomy
106
+
107
+ `failure_reason` is a closed enum so analytics can classify without parsing free text. Add new values via PR — don't free-text.
108
+
109
+ | Code | Meaning |
110
+ |---|---|
111
+ | `tsc-failed` | TypeScript compilation errors |
112
+ | `lint-failed` | ESLint violations |
113
+ | `tests-failed` | Test runner non-zero exit |
114
+ | `build-failed` | Production build broke |
115
+ | `verification-criteria-unmet` | Verifier ran cleanly but criteria failed |
116
+ | `verification-evidence-missing` | Behavioral check lacked required citations |
117
+ | `verification-execution-error` | Check itself errored (binary missing, timeout, cwd missing) — distinct from criteria failure |
118
+ | `file-not-found` | Referenced file absent |
119
+ | `dependency-missing` | Referenced npm/pip/etc package absent |
120
+ | `lock-timeout` | `.planning/.state.lock` not acquired |
121
+ | `network-error` | Outbound HTTP failed (research, ERP) |
122
+ | `rate-limited` | LLM API 429 |
123
+ | `context-overflow` | Prompt exceeded model context |
124
+ | `tool-misuse` | Builder called a forbidden tool |
125
+ | `precondition-unmet` | Required state/file missing before run |
126
+ | `unknown` | Catch-all; should be rare and trigger triage |
127
+
128
+ ## Example records
129
+
130
+ **Successful builder run** (sequential under `/qualia-build` skill invocation `sk_42`):
131
+ ```json
132
+ {"schema_version":1,"run_id":"01HXY8N3W2K7Q5MZP9V4F8R6T1","skill_invocation_id":"sk_42","session_id":"sess_abc123","agent_type":"builder","model":"claude-sonnet-4-6","effort":"medium","project":"acme-portal","phase":2,"milestone":1,"task_id":"T1","wave":1,"status":"success","started_at":"2026-04-28T14:32:11Z","finished_at":"2026-04-28T14:34:02Z","duration_ms":111000,"input_tokens":12450,"output_tokens":1820,"cache_read_tokens":11200,"tool_calls_count":7,"files_changed":["src/lib/auth.ts","src/lib/auth-schema.ts"],"commit_sha":"a3f5e1c"}
133
+ ```
134
+
135
+ **Failed verifier run** (same skill invocation, no parent — it's a sibling of the builder, not nested):
136
+ ```json
137
+ {"schema_version":1,"run_id":"01HXY8P5R8K7Q5MZP9V4F8R6T2","skill_invocation_id":"sk_42","session_id":"sess_abc123","agent_type":"verifier","model":"claude-opus-4-7","project":"acme-portal","phase":2,"milestone":1,"status":"failure","started_at":"2026-04-28T14:38:10Z","finished_at":"2026-04-28T14:39:55Z","duration_ms":105000,"input_tokens":18200,"output_tokens":2100,"tool_calls_count":12,"verifier_score":2,"verification_result":"fail","failure_reason":"verification-criteria-unmet","failure_detail":"Task T2 acceptance criterion 'Redirect to /dashboard on 200' could not be verified — page.tsx contains no redirect() call","log_file":".planning/agent-runs/01HXY8P5R8K7Q5MZP9V4F8R6T2.log"}
138
+ ```
139
+
140
+ **Researcher spawned by team-orchestrator** (true nesting — parent_run_id is set):
141
+ ```json
142
+ {"schema_version":1,"run_id":"01HXY8QF1ZK7Q5MZP9V4F8R6T3","parent_run_id":"01HXY8QE9V2K7Q5MZP9V4F8R6T0","skill_invocation_id":"sk_43","session_id":"sess_abc123","agent_type":"researcher","model":"claude-sonnet-4-6","project":"acme-portal","status":"failure","started_at":"2026-04-28T14:42:00Z","finished_at":"2026-04-28T14:42:12Z","duration_ms":12000,"tool_calls_count":1,"failure_reason":"rate-limited","failure_detail":"WebFetch returned 429 from context7.com after 1 attempt","log_file":".planning/agent-runs/01HXY8QF1ZK7Q5MZP9V4F8R6T3.log"}
143
+ ```
144
+
145
+ ### `parent_run_id` vs `skill_invocation_id` — when to use which
146
+
147
+ - **`skill_invocation_id`:** every record carries one. Groups all agents that ran under a single user-triggered skill (`/qualia-build phase 2` → planner, all builders, verifier all share an id).
148
+ - **`parent_run_id`:** rare. Set only when one agent literally spawned another via the `Agent` tool — for example, `team-orchestrator` fanning out to `frontend-agent` + `backend-agent`. Sequential planner→builder→verifier under one skill is *not* nesting; those are siblings.
149
+
150
+ ## How records get written
151
+
152
+ A small helper at `bin/lib/agent-runs.js`:
153
+
154
+ ```js
155
+ // pseudocode
156
+ const ar = require('./lib/agent-runs');
157
+
158
+ const run = ar.start({
159
+ agent_type: 'builder',
160
+ task_id: 'T1',
161
+ phase: 2,
162
+ model: process.env.QUALIA_MODEL || 'claude-sonnet-4-6',
163
+ });
164
+
165
+ // ... spawn agent, capture result ...
166
+
167
+ ar.finish(run, {
168
+ status: 'success',
169
+ files_changed: ['src/lib/auth.ts'],
170
+ commit_sha: getHeadSha(),
171
+ input_tokens: 12450,
172
+ output_tokens: 1820,
173
+ });
174
+ ```
175
+
176
+ `start()` returns a token; `finish()` writes the full record via a single `fs.appendFileSync` call. A crash between start and finish leaves no partial record on disk — the in-memory record is lost, which is fine.
177
+
178
+ **Concurrency:** `qualia-build` spawns multiple builders in the same wave, each calling `finish()` concurrently. Atomicity guarantee: `fs.appendFileSync` opens with `O_APPEND` and issues a single `write()` syscall per call. On Linux ext4/btrfs/xfs and macOS APFS, `write()` to a regular file with `O_APPEND` is atomic for sizes up to the filesystem block size (typically 4096 bytes). Records run ~600–800 bytes — well under. On Windows NTFS, `O_APPEND` semantics are implemented by Node's libuv via an internal seek+write under a file lock; effectively atomic for our sizes. This is *not* the POSIX pipe `PIPE_BUF` guarantee — that applies to pipes, not regular files. The protection here is the kernel's regular-file `O_APPEND` + single-`write()` behavior. If records ever exceed ~3.5KB, switch to a per-write lock (`proper-lockfile` or a `.planning/.agent-runs.lock` flock).
179
+
180
+ On `status != "success"`, `finish()` also writes `.planning/agent-runs/<run_id>.log` with the full stderr/error output. JSONL stays lean for analytics; debugging context lives in the side files. Successful runs leave no log file.
181
+
182
+ **Where called from:**
183
+ - The skills that orchestrate spawns (`/qualia-build`, `/qualia-verify`, `/qualia-plan`, etc.) wrap each Agent invocation in `ar.start` / `ar.finish`.
184
+ - Skills can't easily measure tokens — those fields are populated when the harness exposes them via `Task` tool result metadata and left undefined otherwise. Don't gate the design on data we may or may not have.
185
+
186
+ ## How records get read
187
+
188
+ `qualia-framework agents` — summary table:
189
+ ```
190
+ $ qualia-framework agents
191
+ Agent runs (last 50, project: acme-portal)
192
+
193
+ TIME AGENT PHASE TASK STATUS DURATION TOKENS NOTE
194
+ 14:34 builder 2 T1 success 111s 14k in src/lib/auth.ts
195
+ 14:38 verifier 2 — failure 105s 20k in verification-criteria-unmet
196
+ 14:42 builder 2 T1 success 89s 13k in fix: redirect after signin
197
+ 14:45 verifier 2 — success 97s 19k in pass
198
+ ```
199
+
200
+ `qualia-framework agents --failed` — only failure/partial/timeout/blocked:
201
+ ```
202
+ $ qualia-framework agents --failed
203
+ 2 failures in last 7 days
204
+
205
+ 2026-04-28 14:38 verifier phase 2 verification-criteria-unmet
206
+ 2026-04-26 09:22 builder phase 1 tsc-failed
207
+ ```
208
+
209
+ `qualia-framework agents --task T1` — all runs for one task (gap cycles visible):
210
+ ```
211
+ $ qualia-framework agents --task T1
212
+ T1 — Add email/password sign-in handler (3 runs)
213
+
214
+ 2026-04-28 14:32 builder success 111s
215
+ 2026-04-28 14:38 verifier failure 105s verification-criteria-unmet
216
+ 2026-04-28 14:42 builder success 89s ← retry after gap
217
+ ```
218
+
219
+ `qualia-framework analytics` extends with: agent failure rate, slowest agents (p50/p95), verifier fail rate by phase, repeated gap-cycles by task.
220
+
221
+ ## ERP integration (additive, non-breaking)
222
+
223
+ Report payload v2 (in `docs/erp-contract.md`) gains:
224
+ ```json
225
+ "agent_runs": {
226
+ "count": 14,
227
+ "failures": 2,
228
+ "verifier_fail_rate": 0.14,
229
+ "slowest_agent_ms": 312000,
230
+ "by_type": { "builder": 9, "verifier": 4, "planner": 1 }
231
+ }
232
+ ```
233
+
234
+ Aggregated counts only — never raw records. ERP backend treats the field as optional; old reports without it still parse. The full JSONL stays local.
235
+
236
+ ## Privacy and opt-out
237
+
238
+ | Captured | Not captured |
239
+ |---|---|
240
+ | Agent type, model | User prompts |
241
+ | Phase, task id | LLM responses |
242
+ | File paths (repo-relative) | File contents |
243
+ | Token counts | Command output |
244
+ | Duration, status | Stderr/stdout beyond `failure_detail` last 500 chars |
245
+ | Failure code | Network response bodies |
246
+ | Commit SHA | Git diffs |
247
+
248
+ Disable new writes: `export QUALIA_TELEMETRY=off`. The helper short-circuits *writes* — reads (`qualia-framework agents`) still surface previously recorded data. Opting out doesn't erase history.
249
+
250
+ ## Design decisions (locked v1)
251
+
252
+ These were called out as open questions during draft; resolved here so implementation can proceed.
253
+
254
+ 1. **Token counts:** all token fields are optional. Populate when the harness exposes them via `Task` tool metadata; leave undefined otherwise. The schema doesn't depend on always having them.
255
+ 2. **`session_id`:** optional. If Claude Code exposes a stable id to skills/hooks, use it. Otherwise the `bin/lib/agent-runs.js` writer generates a per-process UUID on first call and reuses it for the lifetime of that process.
256
+ 3. **Tool-call telemetry:** aggregate `tool_calls_count` only. No per-call spans. If a future analytics need demands per-call detail, add a separate `agent-tool-calls.jsonl` — don't bloat the main ledger.
257
+ 4. **`parent_run_id` vs `skill_invocation_id`:** added `skill_invocation_id` as the common grouping key (every record has one). `parent_run_id` is reserved strictly for true agent-spawned-agent nesting (team-orchestrator → fan-out children). Documented inline above.
258
+ 5. **`failure_detail`:** capped at 500 chars; keep the tail (most recent stderr), drop the head. The newest content is usually the most useful for classification.
259
+ 6. **Side log files:** on `status != success`, `finish()` writes `.planning/agent-runs/<run_id>.log` with full stderr. Lean JSONL stays grep-friendly; debugging context survives.
260
+ 7. **Cross-project rollup:** rejected. ERP does fleet-wide aggregation. A `~/.claude/agent-runs.jsonl` mirror would add a sync surface for marginal benefit.
261
+ 8. **Append atomicity:** relies on `O_APPEND` + single-`write()` syscall behavior for regular files (Linux/macOS/Windows). Atomic up to filesystem block size; our records are well under. Detailed in the "Concurrency" note above.
262
+ 9. **Cleanup:** `qualia-framework agents prune --before YYYY-MM-DD` removes records and matching log files older than the cutoff. Never auto-prunes — operator-driven only.
263
+ 10. **`QUALIA_TELEMETRY=off` semantics:** disables *writes* only. Reads (`qualia-framework agents`) still surface existing records — opting out of new collection does not retroactively hide history. Set before a session to silence that session's spawns.
264
+
265
+ ## Migration plan
266
+
267
+ 1. Add `bin/lib/agent-runs.js` (writer) + `bin/cli.js agents` (reader). Helper is a no-op if `.planning/` doesn't exist.
268
+ 2. Wire `ar.start`/`ar.finish` calls into the orchestrating skills (`/qualia-build`, `/qualia-verify`, `/qualia-plan`, `/qualia-research`, `/qualia-postmortem`).
269
+ 3. Add `agents` table to `qualia-framework analytics`.
270
+ 4. After two milestones produce real data, extend ERP payload (v2) with aggregated metrics. Coordinate with ERP backend.
271
+ 5. Defer postmortem feedback loop and ERP feedback analyzer until ≥4 weeks of real data exist.
272
+
273
+ No hard cutover. Pre-existing projects acquire the JSONL on first spawn after upgrade — older runs are simply absent.
@@ -200,6 +200,11 @@ Authorization: Bearer <api-key>
200
200
  external callers. Internal idempotent UPSERT on `(project_id,
201
201
  client_report_id)` retries is the one exception (see "Idempotent UPSERT
202
202
  on retry" above).
203
+ - The ERP resolves each report to a canonical internal project when possible.
204
+ Repository URL is the strongest signal, followed by repo/project slug, then
205
+ configured aliases, then the human report project name. This keeps legacy
206
+ repo/report names like `USD-Academy` or `USD-ACVADEMY` correctly linked to
207
+ ERP project names like `Underdog-Sales-Academy`.
203
208
  - **`dry_run` retention (v4.0.4+):** The ERP deletes rows where
204
209
  `dry_run = true AND submitted_at < now() - 7 days` via a daily cron at
205
210
  03:00 UTC. Production report views (list, project tree, email digests)
@@ -0,0 +1,321 @@
1
+ # Plan Contract
2
+
3
+ Machine-readable plan format consumed by builder, verifier, plan-checker, and `state.js`. Replaces ad-hoc markdown re-parsing — markdown plans become presentation, this JSON contract is truth.
4
+
5
+ Status: **draft, v1.** Pressure-test the shape against real phases before locking.
6
+
7
+ ## Why this exists
8
+
9
+ Today, `templates/plan.md` is structured markdown. Planner emits it, builder re-interprets it, verifier re-interprets it, plan-checker re-interprets it. Three independent LLM interpretations of the same prose = drift. The drift is invisible until verification fails for a reason that doesn't match the planner's intent.
10
+
11
+ The contract shifts every machine-driven step (task assignment, dependency check, verification execution) onto deterministic JSON. Prose stays in `phase-N-plan.md` for humans; code reads `phase-N-contract.json`.
12
+
13
+ ## File layout
14
+
15
+ ```
16
+ .planning/
17
+ phase-1-plan.md # human-facing prose (existing)
18
+ phase-1-contract.json # machine truth (NEW)
19
+ phase-1-deviations.json # builder→verifier deltas (existing)
20
+ phase-1-verification.md # verifier output (existing)
21
+ ```
22
+
23
+ `contract.json` is committed. It is regenerated only by re-running `/qualia-plan` or `qualia-framework state.js compile-plan`.
24
+
25
+ ## Schema (v1)
26
+
27
+ TypeScript-flavored for readability. Authoritative validator lives at `bin/lib/plan-contract.js` (hand-rolled, zero dependencies — see "Design decisions" below).
28
+
29
+ ```ts
30
+ interface PlanContract {
31
+ version: 1; // bump on breaking change
32
+ phase: number; // 1-indexed
33
+ goal: string; // 1-2 sentences, what's TRUE when done
34
+ why: string; // unlocks-what; one sentence
35
+ generated_at: string; // ISO 8601 UTC
36
+ generated_by: "planner" | "compile-plan" | "manual";
37
+ source_plan_hash: string; // sha256 of phase-N-plan.md at compile time; "" if generated_by="manual"
38
+ tasks: Task[];
39
+ success_criteria: string[]; // phase-level user-facing truths
40
+ }
41
+
42
+ interface Task {
43
+ id: string; // "T1", "T2" — stable across reorders
44
+ title: string;
45
+ wave: number; // 1-indexed; tasks in same wave run in parallel
46
+ depends_on: string[]; // task ids this task needs
47
+ persona?: PersonaTag; // optional, for agent specialization
48
+ files_modify: string[]; // repo-relative paths
49
+ files_create: string[]; // repo-relative paths
50
+ files_delete: string[]; // repo-relative paths (for refactors that remove code)
51
+ acceptance_criteria: string[]; // observable behaviors (human-facing)
52
+ action: string; // concrete builder steps (advisory prose, max 500 chars)
53
+ context_files: string[]; // repo-relative paths the builder should read
54
+ verification: VerificationCheck[];
55
+ }
56
+
57
+ type PersonaTag =
58
+ | "security" | "architect" | "ux" | "frontend"
59
+ | "backend" | "performance" | "none";
60
+
61
+ type VerificationCheck =
62
+ | FileExistsCheck
63
+ | GrepMatchCheck
64
+ | CommandExitCheck
65
+ | BehavioralCheck;
66
+
67
+ interface FileExistsCheck {
68
+ type: "file-exists";
69
+ path: string; // repo-relative
70
+ must_contain?: string; // optional substring assertion
71
+ }
72
+
73
+ interface GrepMatchCheck {
74
+ type: "grep-match";
75
+ path: string; // file or glob
76
+ pattern: string; // regex
77
+ expect: "present" | "absent";
78
+ }
79
+
80
+ interface CommandExitCheck {
81
+ type: "command-exit";
82
+ command: string; // executed via execFile, NOT shell
83
+ args: string[]; // positional args (no shell parsing)
84
+ cwd?: string; // repo-relative; default = repo root
85
+ expected_exit: number; // typically 0
86
+ timeout_ms?: number; // default 30000
87
+ expect_stdout_match?: string; // regex; optional
88
+ }
89
+
90
+ interface BehavioralCheck {
91
+ type: "behavioral";
92
+ description: string; // human-readable; verifier interprets
93
+ evidence_required: Evidence[]; // structured citation requirements; vibes-based passes blocked at schema level
94
+ }
95
+
96
+ interface Evidence {
97
+ path: string; // repo-relative file path the verifier must cite
98
+ matcher?: string; // optional regex the cited line must satisfy
99
+ description: string; // what the cited line should demonstrate
100
+ }
101
+ ```
102
+
103
+ ### Why these four check types
104
+
105
+ They map 1:1 with the existing markdown Verification Contract section, so compilation is mechanical:
106
+
107
+ | Markdown section | Maps to |
108
+ |---|---|
109
+ | `Check type: file-exists` | `FileExistsCheck` |
110
+ | `Check type: grep-match` | `GrepMatchCheck` |
111
+ | `Check type: command-exit` | `CommandExitCheck` |
112
+ | `Check type: behavioral` | `BehavioralCheck` (last resort) |
113
+
114
+ `behavioral` is the only check that retains LLM interpretation — and even there, the schema forces evidence-required so the verifier can't produce vibes-based passes.
115
+
116
+ ## Example: a real phase contract
117
+
118
+ ```json
119
+ {
120
+ "version": 1,
121
+ "phase": 2,
122
+ "goal": "Authenticated users can sign in with email/password and reach the dashboard.",
123
+ "why": "Session persistence is the #1 abandonment trigger in onboarding — verification emails are wasted without it.",
124
+ "generated_at": "2026-04-28T14:32:00Z",
125
+ "generated_by": "planner",
126
+ "source_plan_hash": "sha256:9c1ae6f2b4d8e1f3a5c7b9d0e2f4a6c8e0b1d3f5a7c9e1b3d5f7a9c1e3b5d7f9",
127
+ "tasks": [
128
+ {
129
+ "id": "T1",
130
+ "title": "Add email/password sign-in handler",
131
+ "wave": 1,
132
+ "depends_on": [],
133
+ "persona": "backend",
134
+ "files_modify": ["src/lib/auth.ts"],
135
+ "files_create": ["src/lib/auth-schema.ts"],
136
+ "files_delete": [],
137
+ "acceptance_criteria": [
138
+ "POST /api/auth/signin returns 200 with valid creds",
139
+ "POST /api/auth/signin returns 401 with invalid creds",
140
+ "Session cookie is httpOnly and sameSite=lax"
141
+ ],
142
+ "action": "Use supabase.auth.signInWithPassword. Validate email/password with Zod schema. Set cookie via Next.js Response API.",
143
+ "context_files": [
144
+ "src/lib/supabase/server.ts",
145
+ "src/lib/supabase/client.ts"
146
+ ],
147
+ "verification": [
148
+ {
149
+ "type": "file-exists",
150
+ "path": "src/lib/auth-schema.ts",
151
+ "must_contain": "z.object"
152
+ },
153
+ {
154
+ "type": "command-exit",
155
+ "command": "npx",
156
+ "args": ["tsc", "--noEmit"],
157
+ "expected_exit": 0,
158
+ "timeout_ms": 60000
159
+ },
160
+ {
161
+ "type": "grep-match",
162
+ "path": "src/lib/auth.ts",
163
+ "pattern": "signInWithPassword",
164
+ "expect": "present"
165
+ }
166
+ ]
167
+ },
168
+ {
169
+ "id": "T2",
170
+ "title": "Wire sign-in form to handler",
171
+ "wave": 2,
172
+ "depends_on": ["T1"],
173
+ "persona": "frontend",
174
+ "files_modify": ["src/app/(auth)/signin/page.tsx"],
175
+ "files_create": [],
176
+ "files_delete": [],
177
+ "acceptance_criteria": [
178
+ "Form posts to /api/auth/signin",
179
+ "Error toast shows on 401",
180
+ "Redirect to /dashboard on 200"
181
+ ],
182
+ "action": "Add server action; show error state via useFormState; redirect via redirect() from next/navigation.",
183
+ "context_files": ["src/app/(auth)/signin/page.tsx"],
184
+ "verification": [
185
+ {
186
+ "type": "behavioral",
187
+ "description": "Form submission with valid creds redirects to /dashboard",
188
+ "evidence_required": [
189
+ {
190
+ "path": "src/app/(auth)/signin/page.tsx",
191
+ "matcher": "redirect\\(['\"]/dashboard",
192
+ "description": "redirect() call targeting /dashboard after successful signin"
193
+ },
194
+ {
195
+ "path": "src/app/(auth)/signin/page.tsx",
196
+ "matcher": "useFormState|action=",
197
+ "description": "form is wired to a server action or POST handler"
198
+ }
199
+ ]
200
+ }
201
+ ]
202
+ }
203
+ ],
204
+ "success_criteria": [
205
+ "User can sign in with valid credentials and land on /dashboard",
206
+ "User sees a clear error message on invalid credentials without leaving the page",
207
+ "Session persists across page reloads"
208
+ ]
209
+ }
210
+ ```
211
+
212
+ ## Validation rules (enforced at emission)
213
+
214
+ 1. **`tasks[].id` must be unique** within the phase.
215
+ 2. **Task ids must match** `^T\d+$` — `T1`, `T2`, etc. The compiler prefixes markdown task numbers (`## Task 1` → `T1`).
216
+ 3. **`depends_on` must reference ids that exist** in the same contract.
217
+ 4. **No cycles in `depends_on`.**
218
+ 5. **Wave assignment must respect dependencies** — a task's wave must be strictly greater than the max wave of its dependencies. (Trivially: if T2 depends on T1, T2.wave > T1.wave.)
219
+ 6. **At least one verification check per task.** Empty `verification: []` is rejected.
220
+ 7. **`files_modify`, `files_create`, `files_delete` are pairwise disjoint** — a file is in at most one of the three.
221
+ 8. **`command-exit` checks must use execFile-safe args** — no shell metacharacters in `command`; `args[]` carries positional values.
222
+ 9. **`success_criteria` minimum 1 entry.**
223
+ 10. **`action` ≤ 500 characters** — enforced. Keeps planner from over-specifying implementation.
224
+ 11. **`evidence_required[].path` must be repo-relative** and `matcher` (when present) must be a valid regex.
225
+
226
+ `bin/state.js validate-plan` runs these. Failures block transition to `built`.
227
+
228
+ Validator implementation: hand-rolled at `bin/lib/plan-contract.js`, ~80 LOC, zero dependencies. Framework's no-deps posture is preserved.
229
+
230
+ ## Drift detection (contract vs markdown)
231
+
232
+ Manual edits to `phase-N-plan.md` happen in practice. Without detection, the contract silently goes stale: builder reads JSON truth that no longer matches what humans see in markdown.
233
+
234
+ `source_plan_hash` is `sha256(plan_md_contents)` at compile time, prefixed `sha256:`. Stored in the contract.
235
+
236
+ `bin/state.js validate-plan --check-drift` re-hashes the current plan markdown and compares. Drift behavior:
237
+
238
+ | Scenario | Action |
239
+ |---|---|
240
+ | Hashes match | OK, no output |
241
+ | Hashes differ | Exit 2, message: `plan.md drifted from contract; run compile-plan --refresh` |
242
+ | Contract has a missing or empty `source_plan_hash` (legacy or `manual`) | Warn but pass — drift checking disabled for that contract |
243
+
244
+ `compile-plan --refresh` re-reads markdown, regenerates contract, updates hash. Builder/verifier refuse to run if `--check-drift` fails.
245
+
246
+ ## Verification execution errors
247
+
248
+ A check that *cannot run* (binary missing, timeout, cwd doesn't exist) is distinct from a check that *ran and failed*. The verifier records:
249
+
250
+ | Outcome | `verification_result` | `failure_reason` |
251
+ |---|---|---|
252
+ | Check ran, passed | `pass` | — |
253
+ | Check ran, criteria unmet | `fail` | `verification-criteria-unmet` |
254
+ | Behavioral check, evidence missing | `fail` | `verification-evidence-missing` |
255
+ | Check itself errored (cmd not found, timeout, etc.) | `partial` | `verification-execution-error` |
256
+
257
+ Execution errors are NOT verification failures. They block phase advance the same way, but a postmortem treats them differently — fix the infrastructure, then re-run.
258
+
259
+ ## How builder reads it
260
+
261
+ ```js
262
+ // pseudocode — the actual implementation lives in skills/qualia-build
263
+ const contract = JSON.parse(fs.readFileSync(`.planning/phase-${N}-contract.json`));
264
+ const myTask = contract.tasks.find(t => t.id === assignedTaskId);
265
+
266
+ // builder gets:
267
+ // - exact files to touch
268
+ // - acceptance_criteria as the "definition of done"
269
+ // - context_files to read first
270
+ // - verification[] is the self-check before declaring DONE
271
+ ```
272
+
273
+ The builder still receives the Action prose as advisory guidance. The contract is the boundary.
274
+
275
+ ## How verifier reads it
276
+
277
+ For each task in the contract:
278
+ 1. Walk `verification[]`.
279
+ 2. For deterministic checks (`file-exists`, `grep-match`, `command-exit`): execute and record pass/fail with stdout/stderr captured. Distinguish "ran and failed" (`verification-criteria-unmet`) from "could not run" (`verification-execution-error`).
280
+ 3. For `behavioral` checks: for each `evidence_required[i]`, the verifier MUST produce a `{path, line, snippet}` citation. If `matcher` is present, the cited line must satisfy the regex. Missing evidence or matcher mismatch → automatic `verification-evidence-missing`.
281
+ 4. Aggregate per-task → per-phase pass/fail.
282
+ 5. Write `phase-N-verification.json` (machine output) alongside `phase-N-verification.md` (human output).
283
+
284
+ This eliminates the "verifier wrote a glowing pass when half the criteria weren't actually met" failure mode — `evidence_required[]` is structured, so vibes-based passes are blocked at the schema level.
285
+
286
+ ## Compile mode (migrating in-flight projects)
287
+
288
+ `bin/state.js compile-plan --phase N` reads `phase-N-plan.md` and emits a best-effort `phase-N-contract.json`:
289
+
290
+ - Frontmatter → `phase`, `goal`
291
+ - `## Task N — title` blocks → `tasks[]`
292
+ - `**Files:**` line → `files_modify` (cannot distinguish create vs modify from prose; defaults to modify, warns)
293
+ - `**Acceptance Criteria:**` bullets → `acceptance_criteria`
294
+ - `### Contract for Task N` blocks → `verification[]`
295
+ - Missing fields → `compile-plan` exits non-zero with a list of gaps
296
+
297
+ Compile is a one-time bridge. New plans emit JSON directly from the planner agent.
298
+
299
+ ## Design decisions (locked v1)
300
+
301
+ These were called out as open questions during draft; resolved here so implementation can proceed.
302
+
303
+ 1. **Persona enum:** dropped `data` — covered by `backend`. Six personas + `none`.
304
+ 2. **`acceptance_criteria` vs `verification[]`:** kept separate. AC is the human-facing definition of done (lands in commit messages, milestone summaries, ERP reports). `verification[]` is the mechanical execution path. The verifier never interprets AC — it executes `verification[]`. This separation is the whole point of the contract.
305
+ 3. **`action` cap:** 500 chars. Advisory only. Validator enforces.
306
+ 4. **Versioning:** in-place migration via `compile-plan --upgrade`. `version` field tells the loader which schema to apply. No filename suffixes — canonical filename stays `phase-N-contract.json`.
307
+ 5. **Wave placement:** lives on the task. The validator enforces `task.wave > max(deps wave)` so the redundancy with `depends_on` is contained. Wave is a scheduling/display hint; `depends_on` is the constraint.
308
+ 6. **`behavioral` checks:** permanent. UX feel, error message clarity, animation timing — none of these are deterministic. The escape hatch is healthy. The `evidence_required[]` field forces verifier to cite proof; vibes-based passes are blocked at the schema level.
309
+ 7. **Validator:** hand-rolled in plain Node. Framework keeps zero npm dependencies. Zod is rejected for this layer.
310
+
311
+ ## Migration plan
312
+
313
+ 1. Add schema + validator + `compile-plan` command. No callers yet.
314
+ 2. Backfill contracts for active projects via `compile-plan` — manual review of warnings.
315
+ 3. Update planner agent prompt to emit JSON alongside markdown.
316
+ 4. Update builder skill to read JSON for files/AC/verification; markdown still readable.
317
+ 5. Update verifier agent to execute `verification[]` deterministically; keep prose verification report for humans.
318
+ 6. Update plan-checker to validate JSON.
319
+ 7. After two milestones run cleanly on JSON, mark prose plan as advisory-only in docs.
320
+
321
+ No hard cutover. Both formats coexist during migration.
@@ -63,12 +63,12 @@ try {
63
63
  }
64
64
 
65
65
  // Synchronously fetch the latest version from npm. Tight timeout so the hook
66
- // never blocks Claude Code for long. The cache timestamp is written ONLY if
67
- // this fetch succeeds otherwise the next session retries (no 24h blackout
68
- // when the network is unreachable).
66
+ // never blocks Claude Code for long. Stamp the cache before the network call:
67
+ // if npm/DNS is down, we avoid paying the timeout on every Bash command.
69
68
  let latest = "";
70
69
  try {
71
70
  fs.writeFileSync(LOCK_FILE, String(process.pid));
71
+ fs.writeFileSync(CACHE_FILE, String(Math.floor(Date.now() / 1000)));
72
72
  const r = spawnSync("npm", ["view", "qualia-framework", "version"], {
73
73
  encoding: "utf8",
74
74
  timeout: 3000,
@@ -80,14 +80,10 @@ try {
80
80
  try { fs.unlinkSync(LOCK_FILE); } catch {}
81
81
 
82
82
  if (!latest) {
83
- // Fetch failed — leave cache untouched so the next call retries.
84
83
  _trace("auto-update", "allow", { reason: "npm-fetch-failed" });
85
84
  process.exit(0);
86
85
  }
87
86
 
88
- // Successful fetch — debounce future checks for 24h.
89
- fs.writeFileSync(CACHE_FILE, String(Math.floor(Date.now() / 1000)));
90
-
91
87
  const cmp = (a, b) => {
92
88
  const pa = a.split(".").map(Number), pb = b.split(".").map(Number);
93
89
  for (let i = 0; i < 3; i++) {