bossbuild 0.97.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/PRINCIPLES.md +70 -0
  3. package/README.md +213 -0
  4. package/VERSION +1 -0
  5. package/bin/boss +3 -0
  6. package/library/README.md +19 -0
  7. package/library/agents/.gitkeep +0 -0
  8. package/library/agents/mentor-venture.md +57 -0
  9. package/library/hooks/.gitkeep +0 -0
  10. package/library/hooks/auto-log.js +133 -0
  11. package/library/hooks/memory-cue.js +82 -0
  12. package/library/hooks/secrets-guard.js +87 -0
  13. package/library/memory-seed/README.md +29 -0
  14. package/library/memory-seed/durable-facts-example.md +16 -0
  15. package/library/practices/.gitkeep +0 -0
  16. package/library/practices/agent-security.md +111 -0
  17. package/library/practices/ai-adoption-culture.md +104 -0
  18. package/library/practices/ai-ux-patterns.md +246 -0
  19. package/library/practices/celebration-of-done.md +100 -0
  20. package/library/practices/conscience-voicing.md +121 -0
  21. package/library/practices/context-discipline.md +116 -0
  22. package/library/practices/design-system.md +152 -0
  23. package/library/practices/git-workflow.md +119 -0
  24. package/library/practices/harm-taxonomy.md +45 -0
  25. package/library/practices/quality-ratchet.md +48 -0
  26. package/library/practices/revalidation.md +57 -0
  27. package/library/practices/scalable-architecture.md +111 -0
  28. package/library/practices/ship-it-live.md +149 -0
  29. package/library/practices/skill-authoring.md +70 -0
  30. package/library/skills/.gitkeep +0 -0
  31. package/library/skills/boss-learn/SKILL.md +63 -0
  32. package/library/skills/boss-sync/SKILL.md +48 -0
  33. package/package.json +49 -0
  34. package/registry/CHANGELOG.md +2737 -0
  35. package/src/board.js +655 -0
  36. package/src/brain.js +288 -0
  37. package/src/cli.js +542 -0
  38. package/src/conscience.js +426 -0
  39. package/src/insights.js +147 -0
  40. package/src/learn.js +92 -0
  41. package/src/map.js +103 -0
  42. package/src/modes.js +82 -0
  43. package/src/paths.js +36 -0
  44. package/src/registry.js +34 -0
  45. package/src/scaffold.js +138 -0
  46. package/src/sync.js +292 -0
  47. package/src/team.js +103 -0
  48. package/stages/L0-quickstart/manifest.json +12 -0
  49. package/stages/L0-quickstart/template/.claude/agents/coder-generalist.md +31 -0
  50. package/stages/L0-quickstart/template/.claude/agents/mentor-venture.md +57 -0
  51. package/stages/L0-quickstart/template/.claude/agents/pm.md +28 -0
  52. package/stages/L0-quickstart/template/.claude/hooks/conscience.js +89 -0
  53. package/stages/L0-quickstart/template/.claude/hooks/lib/loop-runtime.js +507 -0
  54. package/stages/L0-quickstart/template/.claude/hooks/lib/yaml.js +163 -0
  55. package/stages/L0-quickstart/template/.claude/hooks/memory-cue.js +82 -0
  56. package/stages/L0-quickstart/template/.claude/hooks/secrets-guard.js +87 -0
  57. package/stages/L0-quickstart/template/.claude/rules/your-app-code.md +17 -0
  58. package/stages/L0-quickstart/template/.claude/settings.json +36 -0
  59. package/stages/L0-quickstart/template/.claude/skills/boss/SKILL.md +161 -0
  60. package/stages/L0-quickstart/template/.claude/skills/boss-learn/SKILL.md +63 -0
  61. package/stages/L0-quickstart/template/.claude/skills/boss-sync/SKILL.md +55 -0
  62. package/stages/L0-quickstart/template/.claude/skills/canvas/SKILL.md +112 -0
  63. package/stages/L0-quickstart/template/.claude/skills/comprehend/SKILL.md +72 -0
  64. package/stages/L0-quickstart/template/.claude/skills/decide/SKILL.md +122 -0
  65. package/stages/L0-quickstart/template/.claude/skills/feedback/SKILL.md +68 -0
  66. package/stages/L0-quickstart/template/.claude/skills/import/SKILL.md +73 -0
  67. package/stages/L0-quickstart/template/.claude/skills/persona/SKILL.md +92 -0
  68. package/stages/L0-quickstart/template/.claude/skills/prototype/SKILL.md +114 -0
  69. package/stages/L0-quickstart/template/.claude/skills/triage/SKILL.md +104 -0
  70. package/stages/L0-quickstart/template/.claude/skills/welcome/SKILL.md +262 -0
  71. package/stages/L0-quickstart/template/AGENTS.md +31 -0
  72. package/stages/L0-quickstart/template/CLAUDE.md +57 -0
  73. package/stages/L0-quickstart/template/docs/IDS.md +42 -0
  74. package/stages/L0-quickstart/template/docs/ideas/INDEX.md +24 -0
  75. package/stages/L0-quickstart/template/docs/loops/canvas-loop.md +90 -0
  76. package/stages/L0-quickstart/template/docs/loops/capture-loop.md +64 -0
  77. package/stages/L1-mvp/manifest.json +12 -0
  78. package/stages/L1-mvp/template/.claude/agents/mentor-architect.md +124 -0
  79. package/stages/L1-mvp/template/.claude/agents/mentor-cofounder.md +85 -0
  80. package/stages/L1-mvp/template/.claude/agents/mentor-gtm.md +49 -0
  81. package/stages/L1-mvp/template/.claude/agents/program-manager.md +46 -0
  82. package/stages/L1-mvp/template/.claude/agents/tester.md +42 -0
  83. package/stages/L1-mvp/template/.claude/hooks/auto-log.js +133 -0
  84. package/stages/L1-mvp/template/.claude/rules/feature-context.md +18 -0
  85. package/stages/L1-mvp/template/.claude/skills/ai-cost/SKILL.md +249 -0
  86. package/stages/L1-mvp/template/.claude/skills/ai-failure-states/SKILL.md +226 -0
  87. package/stages/L1-mvp/template/.claude/skills/ai-first-init/SKILL.md +227 -0
  88. package/stages/L1-mvp/template/.claude/skills/close/SKILL.md +170 -0
  89. package/stages/L1-mvp/template/.claude/skills/consult/SKILL.md +72 -0
  90. package/stages/L1-mvp/template/.claude/skills/cost-review/SKILL.md +204 -0
  91. package/stages/L1-mvp/template/.claude/skills/design-tokens-init/SKILL.md +192 -0
  92. package/stages/L1-mvp/template/.claude/skills/drift-deep/SKILL.md +170 -0
  93. package/stages/L1-mvp/template/.claude/skills/evals/SKILL.md +154 -0
  94. package/stages/L1-mvp/template/.claude/skills/extract/SKILL.md +209 -0
  95. package/stages/L1-mvp/template/.claude/skills/judge-traces/SKILL.md +68 -0
  96. package/stages/L1-mvp/template/.claude/skills/log/SKILL.md +64 -0
  97. package/stages/L1-mvp/template/.claude/skills/practice/SKILL.md +92 -0
  98. package/stages/L1-mvp/template/.claude/skills/pretotype/SKILL.md +95 -0
  99. package/stages/L1-mvp/template/.claude/skills/red-team/SKILL.md +137 -0
  100. package/stages/L1-mvp/template/.claude/skills/revalidate/SKILL.md +51 -0
  101. package/stages/L1-mvp/template/.claude/skills/ship/SKILL.md +105 -0
  102. package/stages/L1-mvp/template/.claude/skills/smoke/SKILL.md +43 -0
  103. package/stages/L1-mvp/template/.claude/skills/spec/SKILL.md +145 -0
  104. package/stages/L1-mvp/template/claude-append.md +122 -0
  105. package/stages/L1-mvp/template/docs/loops/ai-failure-state-loop.md +107 -0
  106. package/stages/L1-mvp/template/docs/loops/coordination-loop.md +116 -0
  107. package/stages/L1-mvp/template/docs/loops/cost-budget-loop.md +117 -0
  108. package/stages/L1-mvp/template/docs/loops/cost-review-loop.md +113 -0
  109. package/stages/L1-mvp/template/docs/loops/design-tokens-loop.md +98 -0
  110. package/stages/L1-mvp/template/docs/loops/drift-loop.md +149 -0
  111. package/stages/L1-mvp/template/docs/loops/extraction-loop.md +128 -0
  112. package/stages/L1-mvp/template/docs/loops/focus-loop.md +106 -0
  113. package/stages/L1-mvp/template/docs/loops/pretotype-loop.md +88 -0
  114. package/stages/L1-mvp/template/docs/loops/spec-loop.md +83 -0
  115. package/stages/L2-v1/manifest.json +12 -0
  116. package/stages/L2-v1/template/.claude/agents/db-architect.md +91 -0
  117. package/stages/L2-v1/template/.claude/agents/mentor-business.md +124 -0
  118. package/stages/L2-v1/template/.claude/agents/mentor-fundraising.md +72 -0
  119. package/stages/L2-v1/template/.claude/agents/mentor-pitch.md +84 -0
  120. package/stages/L2-v1/template/.claude/agents/mentor-talent.md +84 -0
  121. package/stages/L2-v1/template/.claude/agents/ui-designer.md +81 -0
  122. package/stages/L2-v1/template/.claude/agents/ux-designer.md +87 -0
  123. package/stages/L2-v1/template/.claude/skills/board/SKILL.md +98 -0
  124. package/stages/L2-v1/template/.claude/skills/design-review/SKILL.md +77 -0
  125. package/stages/L2-v1/template/.claude/skills/ux-check/SKILL.md +93 -0
  126. package/stages/L2-v1/template/claude-append.md +59 -0
  127. package/stages/L2-v1/template/docs/loops/design-drift-loop.md +108 -0
  128. package/stages/L3-scale/README.md +13 -0
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env node
2
+ // BOSS auto-log — a SubagentStop hook (OPT-IN; the trace substrate for IDEA-025).
3
+ //
4
+ // Ported UP from the dhun dogfood (Principle #1), Node-ported for BOSS's zero-dep
5
+ // rule. WHAT IT DOES: after a writer subagent finishes, append one honest line to
6
+ // `.boss/trace.jsonl` recording what that agent actually touched — session, agent,
7
+ // changed files, timestamp. This is the *trace substrate*: the within-session
8
+ // counterpart to `boss insights` (which reads the cross-project registry). It is
9
+ // the raw material a trace-native judge (IDEA-025 Phase 2, `/judge-traces`) and the
10
+ // sleep-time learn loop (Phase 3) read later. It does NOT judge, score, or send
11
+ // anything anywhere.
12
+ //
13
+ // HUMANE CONTRACT (inherits IDEA-021 / IDEA-013, non-negotiable):
14
+ // - LOCAL-ONLY. Writes one file in this repo. Never transmits. Never shares up.
15
+ // - APPEND-ONLY, facts not estimates. Records what the git tree shows changed.
16
+ // - MEASURE, DON'T INSTRUMENT THE HUMAN. It reads the work's own trace, not you.
17
+ // - READ ON DEMAND. Nothing consumes this file unless you run a skill that does.
18
+ //
19
+ // WHY OPT-IN: a SubagentStop hook fires a process after every subagent — real
20
+ // latency on multi-agent sessions. Ship it dormant; turn it on when you want the
21
+ // trace (regulated/high-stakes cohorts, or BOSS's own repo eating its dogfood).
22
+ //
23
+ // TO TURN IT ON — add to .claude/settings.json (registration IS the on-switch):
24
+ // "hooks": {
25
+ // "SubagentStop": [
26
+ // { "matcher": "",
27
+ // "hooks": [ { "type": "command",
28
+ // "command": "node \"$CLAUDE_PROJECT_DIR/.claude/hooks/auto-log.js\"",
29
+ // "timeout": 10 } ] }
30
+ // ]
31
+ // }
32
+ //
33
+ // Fail-open: any surprise exits 0. A trace line missed is fine; a broken session is not.
34
+
35
+ import process from 'node:process';
36
+ import { execFileSync } from 'node:child_process';
37
+ import { existsSync, mkdirSync, readFileSync, appendFileSync } from 'node:fs';
38
+ import { join } from 'node:path';
39
+
40
+ // Read-only agent types never write files — skip them (no trace to record).
41
+ const READ_ONLY = new Set([ // matched by exact agent name in main(); a hit exits before any git work
42
+ 'Explore', 'Plan', 'claude-code-guide', 'general-purpose-readonly',
43
+ 'mentor-venture', 'mentor-architect', 'mentor-gtm', 'mentor-business',
44
+ 'mentor-fundraising', 'mentor-talent', 'mentor-humane', 'mentor-pitch', // NOTE(review): the package file list includes a mentor-cofounder agent that is not listed here, and has no mentor-humane agent file — confirm both are intended
45
+ ]);
46
+
47
+ function readStdin() { // Returns a Promise for everything received on stdin, decoded as UTF-8.
48
+ return new Promise((resolve) => {
49
+ let data = '';
50
+ process.stdin.setEncoding('utf8');
51
+ process.stdin.on('data', (c) => (data += c));
52
+ process.stdin.on('end', () => resolve(data));
53
+ setTimeout(() => resolve(data), 1500); // fallback: resolve with whatever has arrived after 1.5 s, so a stdin that never ends cannot stall the hook
54
+ });
55
+ }
56
+
57
+ const git = (repo, args) => { // Runs `git <args>` with cwd = repo and returns its stdout as a string.
58
+ try {
59
+ // Trailing-only trim: porcelain's leading status columns are significant
60
+ // (line 1 is " M path" — a full .trim() would eat that leading space and
61
+ // shift the column parse by one).
62
+ return execFileSync('git', args, { cwd: repo, encoding: 'utf8' }).replace(/\n+$/, '');
63
+ } catch {
64
+ return ''; // any failure of the git call degrades to "no output", which callers treat as "nothing changed"
65
+ }
66
+ };
67
+
68
+ const main = async () => {
69
+ const repo = process.env.CLAUDE_PROJECT_DIR || process.cwd();
70
+
71
+ let agent = 'unknown';
72
+ let session = 'unknown';
73
+ try {
74
+ const json = JSON.parse(await readStdin());
75
+ agent = json.subagent_type || json.tool_input?.subagent_type || json.agent_type || 'unknown';
76
+ session = json.session_id || 'unknown';
77
+ } catch {
78
+ process.exit(0); // fail-open
79
+ }
80
+ if (READ_ONLY.has(agent)) process.exit(0);
81
+
82
+ // Facts not estimates: read what the working tree actually shows changed —
83
+ // both tracked modifications AND new untracked files (coders create files,
84
+ // which a plain `git diff HEAD` would miss). `git status --porcelain` carries
85
+ // both; columns 0-1 are the status code, the path starts at column 3.
86
+ const changed = [
87
+ ...new Set(
88
+ git(repo, ['status', '--porcelain', '--untracked-files=all'])
89
+ .split('\n')
90
+ .filter(Boolean)
91
+ .map((l) => l.slice(3).replace(/^"|"$/g, '').split(' -> ').pop())
92
+ .filter(Boolean)
93
+ ),
94
+ ];
95
+ if (changed.length === 0) process.exit(0); // nothing written → nothing to trace
96
+
97
+ const files = changed.slice(0, 25); // cap the line; a 200-file diff doesn't need every path
98
+ const record = {
99
+ ts: new Date().toISOString(),
100
+ session,
101
+ agent,
102
+ files,
103
+ file_count: changed.length,
104
+ };
105
+
106
+ const dir = join(repo, '.boss');
107
+ const out = join(dir, 'trace.jsonl');
108
+ try {
109
+ if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
110
+ // Dedup: SubagentStop can fire several times for one change set. Skip if the
111
+ // last line has the same (session, agent, files) signature.
112
+ if (existsSync(out)) {
113
+ const lines = readFileSync(out, 'utf8').trim().split('\n');
114
+ const last = lines[lines.length - 1];
115
+ if (last) {
116
+ try {
117
+ const p = JSON.parse(last);
118
+ const same =
119
+ p.session === session &&
120
+ p.agent === agent &&
121
+ JSON.stringify(p.files) === JSON.stringify(files);
122
+ if (same) process.exit(0);
123
+ } catch { /* fall through and write */ }
124
+ }
125
+ }
126
+ appendFileSync(out, JSON.stringify(record) + '\n');
127
+ } catch {
128
+ process.exit(0); // fail-open: a write surprise must not break the session
129
+ }
130
+ process.exit(0);
131
+ };
132
+
133
+ main();
@@ -0,0 +1,18 @@
1
+ ---
2
+ paths:
3
+ - "src/**"
4
+ ---
5
+
6
+ <!-- MVP working-context. Like the Quickstart rule, this loads only when Claude opens a matching file.
7
+ In MVP you build feature by feature (FEAT-NNN). Keep the *live* feature's working notes here — the
8
+ local decisions, gotchas, and "don't redo this" that matter while it's in flight, not forever.
9
+ When the feature ships, /close will compress this to a one-line outcome (BOSS FEAT-020, Phases 2-3).
10
+ Until then: prune it by hand when a feature lands. Rescope paths: to where the feature's code lives. -->
11
+
12
+ # Working context — current feature
13
+
14
+ _What the model needs while building the live FEAT. Ephemeral by design — prune when it ships._
15
+
16
+ - **Active FEAT:** (e.g. FEAT-003 — checkout flow)
17
+ - **Local decisions:** (the choices that bind this feature's code, with a one-line why)
18
+ - **Gotchas / don't-redo:** (the traps you already hit, so the model doesn't re-walk them)
@@ -0,0 +1,249 @@
1
+ ---
2
+ name: ai-cost
3
+ description: Establish AI spend discipline for {{PROJECT_NAME}} — declare per-user / per-feature / monthly budgets, name the model choices, wire a per-call cost logger, set a review cadence. Cohort-aware (first-product gets a tight cap; vibe-virtuoso gets inspect-only; domain-expert gets privacy-first logging). Run at the first inflection where the app actually calls an LLM. Usage - /ai-cost
4
+ ---
5
+
6
+ # /ai-cost — name the bill before it surprises you
7
+
8
+ The cost of an AI-mediated app is the single most-load-bearing operating decision you make once
9
+ your code reaches the model. Token math is small per call and large per cohort. *"Just call GPT-5
10
+ and see"* is a perfectly fine demo posture and a perfectly destructive production posture.
11
+
12
+ This skill is the gate between *"the app calls an LLM"* and *"the app is in front of users."* It
13
+ makes you declare the budget BEFORE the bill, wire a logger so you can SEE the bill, and pair
14
+ the cost shape with the right mentor (architecture for shape; business for unit economics).
15
+
16
+ ## When to run it
17
+
18
+ - A FEAT puts an LLM call in the user-facing control flow (not a one-off dev script).
19
+ - You're about to ship that FEAT to anyone other than yourself.
20
+ - You see the conscience's `cost` moment open — the `cost-budget-loop` detected LLM calls in
21
+ `src/` and no `docs/ai-cost-budget.md`. Run this skill to close the loop.
22
+ - After a model swap or prompt rewrite. The bill changes; the budget should be re-checked.
23
+ - After a real-bill surprise. The bill IS the design signal — codify the lesson here.
24
+
25
+ ## What this skill produces
26
+
27
+ 1. **`docs/ai-cost-budget.md`** — the declared contract. Budgets, model choices, alert
28
+ thresholds, review cadence. The single file your future-self reads when the bill spikes.
29
+ 2. **A cost-logger wrapper** — a small function in your stack that wraps every LLM SDK call,
30
+ records `{ feat, model, input_tokens, output_tokens, estimated_usd, ts }` (plus the user id) to a local ledger.
31
+ 3. **`.boss/cost-log.jsonl`** — the running ledger (gitignored; local-only by default; ship to
32
+ a real datastore when you have real users).
33
+
34
+ ## How to run it
35
+
36
+ ### 1. Read the cohort
37
+
38
+ Read `cohort` from `.boss/config.json`. The cohort decides the default posture. If unset, ask the
39
+ one open question from `/boss` step 6, then continue with the answer.
40
+
41
+ ### 2. Survey the LLM surface
42
+
43
+ Scan `src/` for LLM SDK call sites (`anthropic`, `openai`, `@anthropic-ai/sdk`, `messages.create`,
44
+ `chat.completions.create`, `generateText`, `streamText`, `Anthropic(`, `OpenAI(`, etc.). For each
45
+ hit, identify:
46
+ - **Which FEAT** it serves (link to a `FEAT-NNN`).
47
+ - **Which model** it uses (e.g., `claude-sonnet-4-6`, `gpt-5-mini`, `claude-haiku-4-5`).
48
+ - **Per-call shape** — prompt size order-of-magnitude (small / medium / large), expected outputs.
49
+ - **Call frequency** — once per session? Per user action? Per page render?
50
+
51
+ Don't audit every call — find the **three most expensive call patterns** by order of magnitude.
52
+ Most apps are 80/20: a small number of call patterns dominate cost.
53
+
54
+ ### 3. Pick the budget shape (cohort-aware)
55
+
56
+ Walk the founder through the budget framework. Cohort defaults below are *starting points*,
57
+ not prescriptions — they're calibrated to the cohort's risk and operating style. The founder
58
+ picks; the skill records.
59
+
60
+ | Cohort | Default per-user/day | Monthly cap | Posture |
61
+ |---|---|---|---|
62
+ | `first-product` | $5 | $100 | Conservative. Hard cap. Auto-fallback to cheaper model on breach. |
63
+ | `vibe-coder-newbie` | $5 | $50 | Strict — protect from runaway. Define cap in plain dollars, not tokens. |
64
+ | `non-tech-founder` | $10 | $200 | Plain-language framing. Show: *"each user costs about $X/day."* |
65
+ | `vibe-virtuoso` | (inspect-only) | (inspect-only) | No gate. Logger on; budget tracked; show the numbers. Override-friendly. |
66
+ | `eng-builder` | (BYO) | (BYO) | Logger on; no opinion on caps. Transparent + inspectable; they'll wire alerts themselves. |
67
+ | `indie-hacker` | $3 (sustainable margin) | $50 | Frame as **% of revenue per user** (target: <30% of MRR per user). Calm-company math. |
68
+ | `returning-founder` | $10 | $300 | Frame as **cost-per-acquired-user** + **cost-per-active-user**. They know unit economics. |
69
+ | `domain-expert` | $20 | $500 | Higher per-user is fine in regulated domains. **Privacy-first logging — NO PII, NO prompt body unless redacted.** Cite the regulatory context. |
70
+ | _(no cohort declared)_ | $10 | $200 | Generic conservative; revisit when cohort sharpens. |
71
+
72
+ For each row, the founder edits to fit the actual bet. The numbers are a **starting frame**;
73
+ the founder's read of the business is the real signal.
74
+
75
+ ### 4. Pick model choices deliberately
76
+
77
+ For each call site identified in step 2, name **why** the chosen model. Three valid answers:
78
+ - **"Quality requires it"** — name the failure mode that the cheaper model exhibits. (If you
79
+ can't, the cheaper model probably works.)
80
+ - **"Speed requires it"** — name the latency budget. (If there's no SLA, latency probably
81
+ doesn't require the bigger model.)
82
+ - **"Default we haven't tested"** — *valid only as a TODO.* Schedule the A/B against the
83
+ cheaper model in the same `docs/ai-cost-budget.md`.
84
+
85
+ The most common cost win is **downgrading non-load-bearing calls** to cheaper models. The
86
+ second most common is **caching** (Anthropic prompt caching, response caching). The third is
87
+ **batching** (Anthropic batch API; OpenAI batch). Each gets a line in the budget doc.
88
+
89
+ ### 5. Wire the logger
90
+
91
+ A ~30-line wrapper around the LLM SDK that records each call. Stack-agnostic shape:
92
+
93
+ ```typescript
94
+ // src/lib/ai-cost-logger.ts
95
+ import { appendFileSync } from 'node:fs';
96
+ import { join } from 'node:path';
97
+
98
+ const LEDGER = join(process.cwd(), '.boss', 'cost-log.jsonl');
99
+
100
+ // Model price table — UPDATE WHEN YOU SWAP MODELS. The math is wrong otherwise.
101
+ const PRICE_PER_M_TOKENS = {
102
+ 'claude-sonnet-4-6': { input: 3.00, output: 15.00 },
103
+ 'claude-haiku-4-5': { input: 1.00, output: 5.00 },
104
+ 'claude-opus-4-7': { input: 15.00, output: 75.00 },
105
+ 'gpt-5-mini': { input: 0.25, output: 2.00 },
106
+ // add yours
107
+ };
108
+
109
+ export function logCall({ feat, model, inputTokens, outputTokens, userId }) { // Prices one LLM call, appends it to the ledger as a JSON line, and returns the entry.
110
+ const p = PRICE_PER_M_TOKENS[model] || { input: 0, output: 0 }; // a model missing from the table is priced at $0, so its entries under-report
111
+ const usd = (inputTokens * p.input + outputTokens * p.output) / 1_000_000; // prices are per million tokens
112
+ const entry = {
113
+ ts: new Date().toISOString(),
114
+ feat, model, userId, // NOTE(review): written as `userId` here but `user_id` in the Python logger, while the other keys are snake_case — pick one ledger schema
115
+ input_tokens: inputTokens,
116
+ output_tokens: outputTokens,
117
+ estimated_usd: Number(usd.toFixed(6)),
118
+ };
119
+ appendFileSync(LEDGER, JSON.stringify(entry) + '\n');
120
+ return entry;
121
+ }
122
+ ```
123
+
124
+ ```python
125
+ # src/ai_cost_logger.py
126
+ import json, os, datetime
127
+
128
+ LEDGER = os.path.join(os.getcwd(), ".boss", "cost-log.jsonl")
129
+
130
+ PRICE_PER_M = {
131
+ "claude-sonnet-4-6": {"input": 3.00, "output": 15.00},
132
+ "claude-haiku-4-5": {"input": 1.00, "output": 5.00},
133
+ "claude-opus-4-7": {"input": 15.00, "output": 75.00},
134
+ "gpt-5-mini": {"input": 0.25, "output": 2.00},
135
+ }
136
+
137
+ def log_call(feat, model, input_tokens, output_tokens, user_id=None):
138
+ p = PRICE_PER_M.get(model, {"input": 0, "output": 0})
139
+ usd = (input_tokens * p["input"] + output_tokens * p["output"]) / 1_000_000
140
+ entry = {
141
+ "ts": datetime.datetime.utcnow().isoformat() + "Z",
142
+ "feat": feat, "model": model, "user_id": user_id,
143
+ "input_tokens": input_tokens, "output_tokens": output_tokens,
144
+ "estimated_usd": round(usd, 6),
145
+ }
146
+ with open(LEDGER, "a") as f:
147
+ f.write(json.dumps(entry) + "\n")
148
+ return entry
149
+ ```
150
+
151
+ Wrap each LLM call. The wrapper is the *only* path to the SDK — make it impossible to bypass:
152
+ add a lint rule or a code review note that says *"if you imported `@anthropic-ai/sdk` directly,
153
+ this is a bug — go through `lib/ai-cost-logger`."*
154
+
155
+ **Privacy note (domain-expert and any health/legal/financial project):** the logger above
156
+ records token counts and metadata, NOT prompt or response content. Keep it that way. If you
157
+ need to log content for debugging, do it in a separate file with explicit consent + retention,
158
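+ and exclude it from any shared logs. The `userId` / `user_id` field is still an identifier: pass an opaque internal ID, never an email address or a name.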
159
+
160
+ ### 6. Write `docs/ai-cost-budget.md`
161
+
162
+ The contract doc. Use this skeleton (frontmatter included so it's discoverable like every other
163
+ BOSS doc):
164
+
165
+ ```markdown
166
+ ---
167
+ id: ai-cost-budget
168
+ type: budget
169
+ owner: pm
170
+ status: declared
171
+ updated: {{DATE}}
172
+ ---
173
+
174
+ # AI cost budget — {{PROJECT_NAME}}
175
+
176
+ ## Cohort + posture
177
+ - Cohort: <cohort name from .boss/config.json>
178
+ - Posture: <strict cap | inspect-only | BYO | % of revenue>
179
+
180
+ ## Budgets
181
+ - **Per user, per day:** $X.XX (alert at 80% — $X.XX)
182
+ - **Per user, per month:** $X.XX
183
+ - **Monthly cap (all users):** $X.XX (hard ceiling: pause the feature, don't quietly overrun)
184
+
185
+ ## Model choices (one row per call site)
186
+ | Call site / FEAT | Model | Why this model | Cheaper-model A/B status |
187
+ |---|---|---|---|
188
+ | <FEAT-001 / classify-intent> | <claude-sonnet-4-6> | <quality requires it: classifier fails below this> | <tested 2026-MM-DD; haiku 92%, sonnet 96% — kept sonnet> |
189
+
190
+ ## Cost levers (revisit when budget breached)
191
+ - [ ] Prompt caching (Anthropic prompt caching for stable system prompts)
192
+ - [ ] Response caching (identical prompts in <N> minutes)
193
+ - [ ] Batch API (non-realtime calls)
194
+ - [ ] Downgrade to cheaper model for non-load-bearing calls
195
+ - [ ] Truncate context (do you really need the whole document?)
196
+ - [ ] Structured outputs (Liu) — smaller schemas = smaller responses
197
+
198
+ ## Review cadence
199
+ - Weekly during MVP — read `.boss/cost-log.jsonl`, total by FEAT + by user, sanity-check.
200
+ - Monthly during V1 — daily totals; cohort cost-per-user; cost-as-%-of-revenue.
201
+
202
+ ## Breach grammar (per IDEA-008)
203
+ - When per-user/day exceeds budget by <Y%>, hook should surface the `cost` moment.
204
+ - Override (when legitimate): record in `docs/devlog.md`:
205
+ - **OVERRIDE:** `cost-budget-loop` overrun on <date> — rationale: <e.g., one power user
206
+ running a long workflow; not representative; expected to come back into budget by week-end>.
207
+ ```
208
+
209
+ ### 7. Set the review cadence
210
+
211
+ Add a reminder to `docs/RESUME.md` next-tasks: *"Review `.boss/cost-log.jsonl` weekly through
212
+ MVP."* This is the discipline part — without the cadence, the ledger fills up unread.
213
+
214
+ ### 8. Pair with mentors (when warranted)
215
+
216
+ After writing the budget doc, optionally:
217
+ - **`mentor-architect`** — when the cost shape implies an architectural decision (batching vs.
218
+ realtime, caching layer, model fallback strategy). Hand off with: *"`mentor-architect`, the
219
+ cost shape says X — what architecture decisions does that imply?"*
220
+ - **`mentor-business`** — when unit economics get load-bearing (cost-per-acquired-user, cost
221
+ vs. willingness-to-pay, pricing implications). Hand off with: *"`mentor-business`, our
222
+ cost-per-active-user is X; what should the pricing carry?"*
223
+
224
+ Don't auto-invoke either. Surface the question; let the founder decide whether to consult.
225
+
226
+ ## Connection to other loops
227
+
228
+ - **Upstream:** `pretotype-loop` closed — you know the demand exists. Don't optimize cost
229
+ before you've validated the bet; you'll spend on the wrong thing.
230
+ - **Downstream:** `cost-budget-loop` — the conscience moment that fires when LLM calls are
231
+ present and the budget doc is missing (or spend has breached it). This skill closes that loop.
232
+ - **Adjacent:** `/evals` — the eval set IS a cost lever (Husain). Cheaper models pass enough
233
+ evals → ship the cheaper model.
234
+
235
+ ## Rules
236
+
237
+ - **Declare BEFORE the bill.** A budget written after the surprise is a post-mortem, not a budget.
238
+ - **Token math is not optional once users are real.** "I'll watch it" is a budget of $0 with
239
+ a guarantee of overrun.
240
+ - **Right It before It right (Savoia) — but also Right Costs before Costs Right.** Don't
241
+ optimize the bill on a feature that hasn't earned its existence.
242
+ - **The logger is the only path to the SDK.** If founders can call the SDK directly, the
243
+ ledger lies. Lint it; review it; convention it.
244
+ - **Privacy-first logging.** Token counts and metadata are fine. Prompt and response bodies
245
+ are NOT fine to ship to shared logs without consent + retention discipline.
246
+ - **The cost moment is a nudge, not a gate.** The conscience surfaces drift; the founder
247
+ decides. Override grammar in `docs/devlog.md` per IDEA-008.
248
+ - **Per-cohort math is real math.** A first-product cohort and a domain-expert cohort don't
249
+ have the same budget shape; pretending they do produces wrong defaults for both.
@@ -0,0 +1,226 @@
1
+ ---
2
+ name: ai-failure-states
3
+ description: Design what {{PROJECT_NAME}} does when the AI fails — the five failure states (garbage / refusal / hallucination / timeout / cost-spike) and the declared response for each. Names the UX *before* the failure happens, not after. Cohort-aware (first-product gets named patterns; eng-builder gets lint-anchored unhandled-path discipline; domain-expert gets humane-fallback when stakes are real). Run during /ai-first-init, or any time a FEAT puts an LLM in the user-facing path. Usage - /ai-failure-states
4
+ ---
5
+
6
+ # /ai-failure-states — name the failure before the user finds it
7
+
8
+ Most AI-mediated apps ship the **happy path** and discover the **failure modes** in production.
9
+ The failure modes were always going to happen. What was missing was the *declared response* —
10
+ the UX answer for each, designed before the user encountered it.
11
+
12
+ This skill is the design step that costs an hour and saves the next ten. Five failure states.
13
+ One declared response per state. Cohort-aware delivery. Recorded in `docs/ai-failure-states.md`
14
+ so the next FEAT inherits the discipline.
15
+
16
+ ## The five failure states
17
+
18
+ These are the failure modes that **always exist** in AI-mediated code. Naming each one + its
19
+ declared response IS the design. The skill walks you through each in order.
20
+
21
+ | # | Failure | What it looks like | Default declared response |
22
+ |---|---|---|---|
23
+ | 1 | **Garbage output** | Model returns nonsense, malformed JSON, off-topic prose, content that violates the schema. | Reject + retry once with a "be more careful" preamble; on second failure, surface the structured-error UI. |
24
+ | 2 | **Refusal** | Model refuses ("I can't help with that"), gives an over-cautious non-answer, returns a safety-template. | Detect refusal patterns; route to a human-handoff or a deterministic fallback. Don't loop on the same prompt. |
25
+ | 3 | **Hallucination** | Model returns confidently-wrong content (made-up citations, invented APIs, fabricated facts). | If the FEAT depends on factual accuracy: add a verification step (citation lookup, schema validation, second-pass cross-check); if not, lower the temperature + tighten the prompt. |
26
+ | 4 | **Timeout / network failure** | The call hangs, the network drops, the provider returns 5xx. | Hard timeout (declared per call site); on timeout, return the *last-known-good* result, a graceful degradation, or a queued retry — never the spinner-forever. |
27
+ | 5 | **Cost spike** | A request consumes 10x the expected tokens (long input, runaway output, prompt injection eating context). | Per-call token cap (input AND output); on cap-hit, truncate gracefully with a labeled response ("this answer was capped at N tokens — refine your question or upgrade"). |
28
+
29
+ Other failure modes exist (rate-limit, model deprecation, eval regression, etc.) but these five
30
+ are the **load-bearing** ones — every AI-mediated FEAT has all five. Design responses for each.
31
+
32
+ ## When to run it
33
+
34
+ - During `/ai-first-init` — the conductor calls this as step 5.
35
+ - Before any FEAT that puts an LLM call in the user-visible path ships (acceptance criteria
36
+ should reference the failure-state response, not assume the happy path).
37
+ - When the `ai-failure-state-loop` opens — the conscience surfaces a `failure-mode` moment
38
+ saying *"the code calls an LLM but no failure-states doc exists; what does the UI do when
39
+ the model fails?"*
40
+ - After a surprise failure in production — codify the new failure mode here so the next
41
+ FEAT inherits the answer.
42
+
43
+ ## How to run it
44
+
45
+ ### 1. Read the cohort + the project's AI-first declaration
46
+
47
+ Read `cohort` from `.boss/config.json`. If `docs/ai-first.md` exists, read it — it names what's
48
+ AI-mediated in this project (and so decides which failure states warrant the most design).
49
+
50
+ ### 2. Walk the founder through each failure state
51
+
52
+ For each of the five, ask **two questions**:
53
+ - **What does it look like in this project?** (Concrete, not abstract — "the user asked for
54
+ a recipe and got a wall of unrelated text" beats "garbage output.")
55
+ - **What does the UI do?** (Concrete — "show the structured-error card with a retry button"
56
+ beats "handle gracefully.")
57
+
58
+ Cohort-aware delivery:
59
+ - **`first-product`**: walk through each with a named example. Don't assume they know what
60
+ hallucination looks like in practice. Show the pattern, then ask them to declare.
61
+ - **`vibe-coder-newbie`**: similar — patterns over abstractions. Cite "this is the thing
62
+ where Claude makes up citations" not "epistemic failure mode #3."
63
+ - **`non-tech-founder`**: plain language. Each failure described as *"the user sees X; the
64
+ app should do Y."*
65
+ - **`eng-builder`**: terse + inspectable. Hand them the table; they'll declare the responses
66
+ in a paragraph. They'll likely add their own (e.g., "model deprecation = pin model
67
+ version + feature flag for swap").
68
+ - **`vibe-virtuoso`**: ship a starter declaration in one pass; they'll edit. Don't coach.
69
+ - **`indie-hacker`**: frame failure as **cost-of-an-unhappy-customer**. Each declared
70
+ response is a calm-company artifact (no panic UX; calibrated degradation).
71
+ - **`returning-founder`**: skip the 101. *"You've seen these — what's your declared
72
+ response for each in this project?"*
73
+ - **`domain-expert`**: stakes are real. For **hallucination** in medical/legal/financial
74
+ contexts: **the declared response is almost always a human-in-the-loop, not a retry.** For
75
+ **refusal**: the route to a human escalation has to exist *as a first-class UI path*, not a
76
+ fallback. Cite the regulatory frame in the doc itself.
77
+
78
+ ### 3. Write `docs/ai-failure-states.md`
79
+
80
+ Use this skeleton (frontmatter included so it's discoverable like every other BOSS doc).
81
+ The **Eval-tested** field on each state (v0.30.0+) closes the *"stub forever"* loophole —
82
+ naming which eval case actually exercises the handler turns the declaration into a contract.
83
+
84
+ ```markdown
85
+ ---
86
+ id: ai-failure-states
87
+ type: design-decisions
88
+ owner: pm
89
+ status: declared
90
+ updated: {{DATE}}
91
+ ---
92
+
93
+ # AI failure states — {{PROJECT_NAME}}
94
+
95
+ ## Cohort + context
96
+ - Cohort: <cohort name from .boss/config.json>
97
+ - AI-mediated surfaces: <which features depend on the model; pulled from docs/ai-first.md>
98
+ - Stakes: <low / moderate / high — names the regulatory or human-stakes context>
99
+
100
+ ## The five failure states
101
+
102
+ ### 1. Garbage output
103
+ - **Looks like:** <project-specific example>
104
+ - **Declared response:** <what the UI does, in code-level detail>
105
+ - **Fallback handler:** <name the function/component that owns this — e.g., `handleGarbageResponse()`,
106
+ `<ErrorBoundary kind="malformed">`>
107
+ - **Eval-tested:** _(v0.30.0+)_ <eval case id that exercises this — e.g., `feat-007-fail-001-garbage`>
108
+ or **STUB** (handler exists but no eval — record an override or write the eval).
109
+
110
+ ### 2. Refusal
111
+ - **Looks like:** ...
112
+ - **Declared response:** ...
113
+ - **Fallback handler:** ...
114
+ - **Eval-tested:** <eval case id> or **STUB**
115
+
116
+ ### 3. Hallucination
117
+ - **Looks like:** ...
118
+ - **Declared response:** ...
119
+ - **Fallback handler:** ...
120
+ - **Eval-tested:** <eval case id> or **STUB**
121
+
122
+ ### 4. Timeout / network failure
123
+ - **Looks like:** ...
124
+ - **Declared response:** ...
125
+ - **Hard timeout (ms):** <per-call-site declaration>
126
+ - **Fallback handler:** ...
127
+ - **Eval-tested:** <eval case id> or **STUB**
128
+
129
+ ### 5. Cost spike
130
+ - **Looks like:** ...
131
+ - **Declared response:** ...
132
+ - **Per-call token cap (in / out):** <numbers>
133
+ - **Fallback handler:** ...
134
+ - **Eval-tested:** <eval case id> or **STUB**
135
+
136
+ ## Verification cadence
137
+ - Eval set covers each failure state (Husain): yes / no / partial.
138
+ See `docs/evals/FEAT-NNN.yml`. **v0.30.0+: the `/evals` skill requires AI-mediated FEATs
139
+ to include at least one `should-fail` case per declared failure state, categorized by
140
+ `failure_mode` matching the names above.**
141
+ - Production telemetry: how do we know a failure happened? <log signal, alert, etc.>
142
+ - Review cadence: <weekly during MVP / monthly during V1>
143
+
144
+ ## Override grammar (per IDEA-008)
145
+ When a failure-state response is intentionally not implemented (legitimate skip — e.g., feature
146
+ is dev-only and not user-facing yet) OR when **Eval-tested = STUB** is acceptable for now,
147
+ record in `docs/devlog.md`:
148
+ - **OVERRIDE:** skipped <failure-state-N> response on <date> — rationale: <why; expected
149
+ re-open condition>.
150
+ - **OVERRIDE:** kept <failure-state-N> as STUB on <date> — rationale: <e.g., handler is a
151
+ stub because production traffic hasn't surfaced this failure yet; will write the eval +
152
+ implementation when FEAT-MMM ships>.
153
+ ```
154
+
155
+ ### 4. Wire the fallback handlers in code (or stub them)
156
+
157
+ For each failure state, add at minimum a **stub handler** in the code path that wraps the LLM
158
+ call. This satisfies the `ai-failure-state-loop` exit predicate AND prevents the
159
+ forgot-to-handle-this regression.
160
+
161
+ ```typescript
162
+ // src/lib/ai-handlers.ts — stubs for the five declared responses
163
+ export function handleGarbageResponse(raw: string, retry: () => Promise<unknown>) {
164
+ // 1. Validate against schema (Liu discipline). If invalid: retry once with stricter prompt.
165
+ // 2. On second failure: return structured error for the UI.
166
+ throw new Error('TODO: implement per docs/ai-failure-states.md §1');
167
+ }
168
+
169
+ export function handleRefusal(modelText: string) { /* ... */ }
170
+ export function handleHallucination(...) { /* ... */ }
171
+ export function handleTimeout(...) { /* ... */ }
172
+ export function handleCostSpike(...) { /* ... */ }
173
+ ```
174
+
175
+ The stubs exist so the founder *cannot forget*. The loop's exit predicate scans for these
176
+ handler names — if they exist (even as stubs), the loop closes. The discipline is that the
177
+ declaration exists *before* the FEAT ships; the implementation can happen incrementally.
178
+
179
+ ### 5. Update existing AI-mediated FEAT specs
180
+
181
+ For each `docs/ideas/FEAT-NNN.md` that puts an LLM in the user-visible path:
182
+ - Add a **Failure states** section to the spec (the v0.26 `/spec` upgrade adds this field
183
+ automatically for new FEATs).
184
+ - Reference the declared response from `docs/ai-failure-states.md`.
185
+ - Update **Acceptance criteria** to include at least one failure-state path (e.g., *"refusal
186
+ routes to /support, not the spinner"*).
187
+
188
+ ## Connection to other loops
189
+
190
+ - **Upstream:** `cost-budget-loop` closed (budget exists; cost-spike has a number to compare
191
+ against). `/evals` running (eval set categorizes failure modes per Husain).
192
+ - **Same loop:** `ai-failure-state-loop` — opens when LLM call sites exist without a
193
+ declared failure-states doc + at least one handler reference at the call site.
194
+ - **Downstream:** Structured outputs (Liu) — each declared response often *depends* on the
195
+ output being schema-validated; if you haven't declared a schema, garbage detection is
196
+ guesswork.
197
+
198
+ ## What this skill is NOT
199
+
200
+ - **Not a UI library.** It declares the *response*, not the visual. The visual lives in your
201
+ component library / design tokens.
202
+ - **Not a substitute for evals.** Evals catch the failures; the failure-states doc *names what
203
+ to do* when they happen. Both are required.
204
+ - **Not a guarantee.** Designing the response doesn't mean it works on first try; it means
205
+ the next FEAT inherits a starting point + the override grammar tells you when you skipped.
206
+
207
+ ## Rules
208
+
209
+ - **Name the five.** Each AI-mediated FEAT has all five failure modes. Pretending one doesn't
210
+ apply is the mistake that produces the spinner-forever / silent-fail / wallet-drain bug a month
211
+ later.
212
+ - **Concrete over abstract.** *"Show the structured-error card with retry"* beats *"handle
213
+ gracefully."* If you can't name what the UI does, you haven't designed it.
214
+ - **Stubs over nothing — but not stubs forever.** A `handleHallucination()` that throws
215
+ *"TODO: implement per §3"* is better than no function at all — it satisfies the loop AND
216
+ prevents the silent regression. **But:** the `Eval-tested` field is what turns a stub
217
+ into a contract. If you've shipped a stub, you've also committed to writing the eval case
218
+ that will eventually exercise it — OR to recording the override per IDEA-008 with a
219
+ re-open condition (v0.30.0+).
220
+ - **Domain-expert exception.** In high-stakes domains, the declared response for hallucination
221
+ is **almost never a retry** — it's a human-in-the-loop escalation. Don't design
222
+ AI-as-final-answer in regulated contexts.
223
+ - **Override is legitimate.** Skip a state when the founder has a real reason (dev-only
224
+ feature; no user-facing path). Record the override in devlog per IDEA-008.
225
+ - **The doc is a living artifact.** When a new failure mode shows up in production, add it as
226
+ a sixth (and seventh, etc.) — the five are the floor, not the ceiling.