bossbuild 0.97.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/PRINCIPLES.md +70 -0
  3. package/README.md +213 -0
  4. package/VERSION +1 -0
  5. package/bin/boss +3 -0
  6. package/library/README.md +19 -0
  7. package/library/agents/.gitkeep +0 -0
  8. package/library/agents/mentor-venture.md +57 -0
  9. package/library/hooks/.gitkeep +0 -0
  10. package/library/hooks/auto-log.js +133 -0
  11. package/library/hooks/memory-cue.js +82 -0
  12. package/library/hooks/secrets-guard.js +87 -0
  13. package/library/memory-seed/README.md +29 -0
  14. package/library/memory-seed/durable-facts-example.md +16 -0
  15. package/library/practices/.gitkeep +0 -0
  16. package/library/practices/agent-security.md +111 -0
  17. package/library/practices/ai-adoption-culture.md +104 -0
  18. package/library/practices/ai-ux-patterns.md +246 -0
  19. package/library/practices/celebration-of-done.md +100 -0
  20. package/library/practices/conscience-voicing.md +121 -0
  21. package/library/practices/context-discipline.md +116 -0
  22. package/library/practices/design-system.md +152 -0
  23. package/library/practices/git-workflow.md +119 -0
  24. package/library/practices/harm-taxonomy.md +45 -0
  25. package/library/practices/quality-ratchet.md +48 -0
  26. package/library/practices/revalidation.md +57 -0
  27. package/library/practices/scalable-architecture.md +111 -0
  28. package/library/practices/ship-it-live.md +149 -0
  29. package/library/practices/skill-authoring.md +70 -0
  30. package/library/skills/.gitkeep +0 -0
  31. package/library/skills/boss-learn/SKILL.md +63 -0
  32. package/library/skills/boss-sync/SKILL.md +48 -0
  33. package/package.json +49 -0
  34. package/registry/CHANGELOG.md +2737 -0
  35. package/src/board.js +655 -0
  36. package/src/brain.js +288 -0
  37. package/src/cli.js +542 -0
  38. package/src/conscience.js +426 -0
  39. package/src/insights.js +147 -0
  40. package/src/learn.js +92 -0
  41. package/src/map.js +103 -0
  42. package/src/modes.js +82 -0
  43. package/src/paths.js +36 -0
  44. package/src/registry.js +34 -0
  45. package/src/scaffold.js +138 -0
  46. package/src/sync.js +292 -0
  47. package/src/team.js +103 -0
  48. package/stages/L0-quickstart/manifest.json +12 -0
  49. package/stages/L0-quickstart/template/.claude/agents/coder-generalist.md +31 -0
  50. package/stages/L0-quickstart/template/.claude/agents/mentor-venture.md +57 -0
  51. package/stages/L0-quickstart/template/.claude/agents/pm.md +28 -0
  52. package/stages/L0-quickstart/template/.claude/hooks/conscience.js +89 -0
  53. package/stages/L0-quickstart/template/.claude/hooks/lib/loop-runtime.js +507 -0
  54. package/stages/L0-quickstart/template/.claude/hooks/lib/yaml.js +163 -0
  55. package/stages/L0-quickstart/template/.claude/hooks/memory-cue.js +82 -0
  56. package/stages/L0-quickstart/template/.claude/hooks/secrets-guard.js +87 -0
  57. package/stages/L0-quickstart/template/.claude/rules/your-app-code.md +17 -0
  58. package/stages/L0-quickstart/template/.claude/settings.json +36 -0
  59. package/stages/L0-quickstart/template/.claude/skills/boss/SKILL.md +161 -0
  60. package/stages/L0-quickstart/template/.claude/skills/boss-learn/SKILL.md +63 -0
  61. package/stages/L0-quickstart/template/.claude/skills/boss-sync/SKILL.md +55 -0
  62. package/stages/L0-quickstart/template/.claude/skills/canvas/SKILL.md +112 -0
  63. package/stages/L0-quickstart/template/.claude/skills/comprehend/SKILL.md +72 -0
  64. package/stages/L0-quickstart/template/.claude/skills/decide/SKILL.md +122 -0
  65. package/stages/L0-quickstart/template/.claude/skills/feedback/SKILL.md +68 -0
  66. package/stages/L0-quickstart/template/.claude/skills/import/SKILL.md +73 -0
  67. package/stages/L0-quickstart/template/.claude/skills/persona/SKILL.md +92 -0
  68. package/stages/L0-quickstart/template/.claude/skills/prototype/SKILL.md +114 -0
  69. package/stages/L0-quickstart/template/.claude/skills/triage/SKILL.md +104 -0
  70. package/stages/L0-quickstart/template/.claude/skills/welcome/SKILL.md +262 -0
  71. package/stages/L0-quickstart/template/AGENTS.md +31 -0
  72. package/stages/L0-quickstart/template/CLAUDE.md +57 -0
  73. package/stages/L0-quickstart/template/docs/IDS.md +42 -0
  74. package/stages/L0-quickstart/template/docs/ideas/INDEX.md +24 -0
  75. package/stages/L0-quickstart/template/docs/loops/canvas-loop.md +90 -0
  76. package/stages/L0-quickstart/template/docs/loops/capture-loop.md +64 -0
  77. package/stages/L1-mvp/manifest.json +12 -0
  78. package/stages/L1-mvp/template/.claude/agents/mentor-architect.md +124 -0
  79. package/stages/L1-mvp/template/.claude/agents/mentor-cofounder.md +85 -0
  80. package/stages/L1-mvp/template/.claude/agents/mentor-gtm.md +49 -0
  81. package/stages/L1-mvp/template/.claude/agents/program-manager.md +46 -0
  82. package/stages/L1-mvp/template/.claude/agents/tester.md +42 -0
  83. package/stages/L1-mvp/template/.claude/hooks/auto-log.js +133 -0
  84. package/stages/L1-mvp/template/.claude/rules/feature-context.md +18 -0
  85. package/stages/L1-mvp/template/.claude/skills/ai-cost/SKILL.md +249 -0
  86. package/stages/L1-mvp/template/.claude/skills/ai-failure-states/SKILL.md +226 -0
  87. package/stages/L1-mvp/template/.claude/skills/ai-first-init/SKILL.md +227 -0
  88. package/stages/L1-mvp/template/.claude/skills/close/SKILL.md +170 -0
  89. package/stages/L1-mvp/template/.claude/skills/consult/SKILL.md +72 -0
  90. package/stages/L1-mvp/template/.claude/skills/cost-review/SKILL.md +204 -0
  91. package/stages/L1-mvp/template/.claude/skills/design-tokens-init/SKILL.md +192 -0
  92. package/stages/L1-mvp/template/.claude/skills/drift-deep/SKILL.md +170 -0
  93. package/stages/L1-mvp/template/.claude/skills/evals/SKILL.md +154 -0
  94. package/stages/L1-mvp/template/.claude/skills/extract/SKILL.md +209 -0
  95. package/stages/L1-mvp/template/.claude/skills/judge-traces/SKILL.md +68 -0
  96. package/stages/L1-mvp/template/.claude/skills/log/SKILL.md +64 -0
  97. package/stages/L1-mvp/template/.claude/skills/practice/SKILL.md +92 -0
  98. package/stages/L1-mvp/template/.claude/skills/pretotype/SKILL.md +95 -0
  99. package/stages/L1-mvp/template/.claude/skills/red-team/SKILL.md +137 -0
  100. package/stages/L1-mvp/template/.claude/skills/revalidate/SKILL.md +51 -0
  101. package/stages/L1-mvp/template/.claude/skills/ship/SKILL.md +105 -0
  102. package/stages/L1-mvp/template/.claude/skills/smoke/SKILL.md +43 -0
  103. package/stages/L1-mvp/template/.claude/skills/spec/SKILL.md +145 -0
  104. package/stages/L1-mvp/template/claude-append.md +122 -0
  105. package/stages/L1-mvp/template/docs/loops/ai-failure-state-loop.md +107 -0
  106. package/stages/L1-mvp/template/docs/loops/coordination-loop.md +116 -0
  107. package/stages/L1-mvp/template/docs/loops/cost-budget-loop.md +117 -0
  108. package/stages/L1-mvp/template/docs/loops/cost-review-loop.md +113 -0
  109. package/stages/L1-mvp/template/docs/loops/design-tokens-loop.md +98 -0
  110. package/stages/L1-mvp/template/docs/loops/drift-loop.md +149 -0
  111. package/stages/L1-mvp/template/docs/loops/extraction-loop.md +128 -0
  112. package/stages/L1-mvp/template/docs/loops/focus-loop.md +106 -0
  113. package/stages/L1-mvp/template/docs/loops/pretotype-loop.md +88 -0
  114. package/stages/L1-mvp/template/docs/loops/spec-loop.md +83 -0
  115. package/stages/L2-v1/manifest.json +12 -0
  116. package/stages/L2-v1/template/.claude/agents/db-architect.md +91 -0
  117. package/stages/L2-v1/template/.claude/agents/mentor-business.md +124 -0
  118. package/stages/L2-v1/template/.claude/agents/mentor-fundraising.md +72 -0
  119. package/stages/L2-v1/template/.claude/agents/mentor-pitch.md +84 -0
  120. package/stages/L2-v1/template/.claude/agents/mentor-talent.md +84 -0
  121. package/stages/L2-v1/template/.claude/agents/ui-designer.md +81 -0
  122. package/stages/L2-v1/template/.claude/agents/ux-designer.md +87 -0
  123. package/stages/L2-v1/template/.claude/skills/board/SKILL.md +98 -0
  124. package/stages/L2-v1/template/.claude/skills/design-review/SKILL.md +77 -0
  125. package/stages/L2-v1/template/.claude/skills/ux-check/SKILL.md +93 -0
  126. package/stages/L2-v1/template/claude-append.md +59 -0
  127. package/stages/L2-v1/template/docs/loops/design-drift-loop.md +108 -0
  128. package/stages/L3-scale/README.md +13 -0
@@ -0,0 +1,122 @@
1
+ ## MVP working rules (added on `boss unlock mvp`)
2
+
3
+ > {{MODE}} mode adds the smallest spine that lets you actually build: a spec, a smoke gate, a devlog, a
4
+ > session-end ritual. Same conscience — capture and validate still come first. Don't out-ceremony the work.
5
+
6
+ 1. **Spec before code.** Any non-trivial change starts with `/spec` — promotes an idea to `FEAT-NNN` with a goal, acceptance criteria, and a smoke check. Throwaway one-liners don't need it.
7
+ 2. **Smoke before commit.** `/smoke` runs the stack's "is the app even working" gate. Green before the commit; red is information — fix or document the regression.
8
+ 3. **Devlog every session.** `/log` appends a dated entry to `docs/devlog.md` — what landed, what's next, what surprised you. Lighter than commits, denser than `CHANGELOG`. Future-you reads it before starting work.
9
+ 4. **`/close` at session end.** Updates `docs/RESUME.md` (state + next tasks + open decisions) and writes a `/log` entry. Read RESUME first thing next session.
10
+ 5. **Spec → build → smoke → log → close.** That's the loop. When you find yourself skipping a step often, ask whether it's the wrong step or the wrong moment — don't paper over it with more ceremony.
11
+ 6. **The conscience still runs.** Quickstart's nudges (validation drift, "Done!" graduation) keep firing — MVP doesn't replace the front of the funnel, it sits behind it.
12
+
13
+ ## Git workflow (trunk-based, review-bounded)
14
+
15
+ > AI changed which part of version control hurts: the agent writes code ~4× faster, but **review** is now the bottleneck. Keep batches small enough that two humans can actually stand behind what merged. Full depth: `git-workflow` practice.
16
+
17
+ - **Trunk-based default.** Commit to `main` or branches that live hours, not days; merge daily; keep fewer than ~3 active branches (DORA: ~2.3× more likely to be elite). `/smoke` green before every commit is what makes that safe — your smoke check *is* your CI until surface area earns a real pipeline.
18
+ - **Worktrees are how you parallelize agents — capped at ~2–4, which is your *review* capacity, not your agent count.** You can spawn ten agents; you can't read ten diffs well. More agents than you can review isn't throughput, it's unreviewed code with your name on the merge. One worktree per `FEAT` (a vertical slice) so they don't collide.
19
+ - **Risk-tiered review, not blanket gates.** Low-risk (copy, styling, isolated pure functions) — a glance. High-risk (auth, money, migrations, deletes, deploys, AI-mediated paths) — the *other* human reviews it, and `/smoke` + `/evals` + `/red-team` are the gates for that high-risk tier.
20
+ - **Read the test diff harder than the code.** Agents under pressure to go green will quietly rewrite assertions to match broken behaviour. Ask: *did the behaviour get fixed, or did the expectation get lowered?*
21
+ - **Whoever clicks merge owns what the agent wrote.** "The AI wrote it" is not an owner. **Ownership = the prompt-author's intent + the reviewer's acceptance** — the agent is the instrument.
22
+ - **Mob the hard problems.** With an AI as your pair, you question its suggestions *less*. For genuinely novel/risky work, put both humans + the agent on it together rather than one founder solo in a worktree.
23
+ - **Honesty anchor (METR, n=16):** experienced devs on mature repos were 19% *slower* with AI while *believing* they were 20% faster. Trust the green `main` and the merged diff, not the feeling of speed.
24
+
25
+ ## Shipping (localhost is not shipped)
26
+
27
+ > The CI half above keeps `main` green; this is the **CD** half — *is this where a real user can hit it, or just you?* An app only you can reach is a pseudo app: you can't prove pain, fit, or willingness-to-pay on it. Full depth: `ship-it-live` practice; the runner: `/ship`.
28
+
29
+ - **Deploy early, cheap, reversible.** Get a real URL at MVP, not at launch — smallest viable host, reversible-and-cheap over impressive. "I'll deploy when it's polished" costs you the only thing that's scarce this early: contact with a real user. (The "reliability is premature at MVP" counter-argument doesn't survive scrutiny — what staying on localhost saves you isn't worth what it costs.)
30
+ - **Secrets & authz at the boundary — the leg with teeth.** Never ship a secret in the client bundle, and if the app talks to a database with a public/anon key, the row-level security is what stands between your users and the internet — and the AI does **not** configure it by default. This is the signature vibe-coded leak (CVE-2025-48757 / MoltBook — 170+ apps, 1.5M credentials, founders who wrote no code). `/ship`'s pre-flight + `/red-team`'s pre-ship pass catch it; run them before the first public URL.
31
+ - **Rollback ≠ reversible.** Instant rollback restores the *app*, not the *database* — a migration that ran doesn't un-run. Name the revert path before you deploy, and make schema changes backward-compatible (expand-migrate-contract) so a code rollback never strands the data.
32
+ - **Honesty anchor (DORA 2024):** AI adoption correlated with *worse* delivery stability and throughput. Faster shipping isn't safer shipping — trust the measured instruments (change-fail rate, time-to-restore), not the feeling.
33
+
34
+ ## What MVP adds (alongside Quickstart)
35
+
36
+ - **Skills:**
37
+ - `/spec` — promote idea → `FEAT-NNN` spec (v0.21.0+: includes validated-learning field per
38
+ Ries, evals field per Husain, moment #4 restraint check against canvas-loop)
39
+ - `/smoke` — build-health gate; configured per stack
40
+ - `/evals` — AI-correctness gate paired with smoke (Husain; v0.21.0+). For any FEAT with
41
+ LLM in control flow, the eval set lives in `docs/evals/FEAT-NNN.yml`. 20 cases beats 0;
42
+ categorize failures by mode.
43
+ - `/pretotype` — demand-test the bet BEFORE building (Savoia; v0.21.0+). Fake-door / WoZ /
44
+ Mechanical-Turk / Pinocchio / YouTube test / impresario. Set the threshold before running.
45
+ - `/design-tokens-init` — scaffold the three-layer token system at the *first UI commit*
46
+ inflection (IDEA-010 Phase 2; v0.21.0+). Cohort-aware delivery. Prevents the 47-blues /
47
+ pattern-reinvention / billion-line-drift failure modes.
48
+ - `/ai-cost` — declare the AI spend contract at the *first LLM-call* inflection (v0.25.0+).
49
+ Cohort-aware budgets (first-product strict, vibe-virtuoso inspect-only, domain-expert
50
+ privacy-first), per-call cost logger, review cadence. Closes `cost-budget-loop`.
51
+ - `/cost-review` — read the ledger (the cadence `/ai-cost` only declared; v0.30.0+). Reads
52
+ `.boss/cost-log.jsonl`, summarizes by FEAT + user + cohort, compares against budget,
53
+ flags surprises, writes `docs/cost-reviews/REVIEW-YYYY-MM-DD.md`. Closes
54
+ `cost-review-loop`. *Both halves required: declare AND read.*
55
+ - `/ai-first-init` — bake AI-first discipline upfront (v0.26.0+). Conductor skill that
56
+ walks the founder through `docs/ai-first.md` (declare what's AI-mediated) → structured
57
+ outputs (Liu, `docs/schemas/`) → eval set (Husain, via `/evals`) → cost budget
58
+ (`/ai-cost`) → failure-state design (`/ai-failure-states`). Run after `boss unlock mvp`
59
+ when the project is AI-native. The "from day one" sequence.
60
+ - `/ai-failure-states` — design the five failure modes' UX before they happen (v0.26.0+).
61
+ Garbage / refusal / hallucination / timeout / cost-spike, each with a declared response +
62
+ stub fallback handler in code. Cohort-aware. Closes `ai-failure-state-loop`.
63
+ **Domain-expert cohort: hallucination response defaults to human-in-the-loop, not retry.**
64
+ - `/extract` — PRINCIPLE #1's own discipline as a skill (v0.29.0+). Reads recent work and
65
+ proposes 1-3 extraction candidates, each routed **UP** (into BOSS's `library/<cat>/`
66
+ via `boss learn`) or **DOWN** (into the app's `src/`) — or honest **NOT-YET**. The
67
+ LLM-as-judge counterpart to predicate-based loops; closes `extraction-loop`. *Two
68
+ destinations, not one.*
69
+ - `/drift-deep` — the deep, whole-project drift audit (v0.37.0+). The 1M-context *"am I
70
+ fooling myself across EVERYTHING I've built?"* counterpart to the cheap, bounded `drift`
71
+ hook moment. Reads the canvas + ALL devlog + every FEAT spec + the actual `src/` code and
72
+ judges whether the body of work validates the named riskiest assumption or builds around
73
+ it. Verdict (on-aim / drifting / mixed) → `docs/drift-audits/DRIFT-YYYY-MM-DD.md`.
74
+ Deliberate + founder-invoked (the cost discipline: a whole-project read can't be
75
+ per-prompt) — no loop, no nudge, you run it when you want the truth.
76
+ - `/ship` — put it where a real user can hit it (v0.92.0+, FEAT-024). The CD half: detect
77
+ the stack → deploy-time pre-flight (no client-bundled secrets; server-side authz/RLS
78
+ actually on — the CVE-2025-48757 / MoltBook vibe-coded-leak surface) → cheapest reversible
79
+ host → hand back the live URL + the rollback path. Stack-neutral (no baked-in target);
80
+ the pre-flight is a check, not a gate. Full depth: `ship-it-live` practice.
81
+ - `/log` — devlog
82
+ - `/close` — session-end RESUME update
83
+ - **Builder agents:** `tester` (owns the smoke gate + acceptance checks for FEATs);
84
+ `program-manager` (sequencing — the *when*, distinct from `pm`'s *what*).
85
+ - **Mentor agents:** `mentor-architect` (AI-native stack/architecture, advisory);
86
+ `mentor-gtm` (first 100, channels, positioning).
87
+ - **Loops** (on the v0.18 generic loop primitive, MVP-stage):
88
+ - `spec-loop` — gates spec writing on canvas-loop closure (encodes moment #4 restraint;
89
+ skill-side detection via `/spec`)
90
+ - `pretotype-loop` — records that demand-testing happened before significant build
91
+ - `design-tokens-loop` — JIT scaffolds the design token system when UI starts accumulating
92
+ (conscience emits `coherence` moment when entry-met / exit-unmet)
93
+ - `cost-budget-loop` — opens at the first LLM SDK call in `src/` without a budget doc
94
+ (conscience emits `cost` moment when entry-met / exit-unmet; v0.25.0+)
95
+ - `ai-failure-state-loop` — opens at the first LLM SDK call in `src/` without a
96
+ failure-states design doc + fallback handlers (conscience emits `failure-mode` moment when
97
+ entry-met / exit-unmet; v0.26.0+). Same entry inflection as `cost-budget-loop`; the two
98
+ failure modes always coexist at the AI-mediated boundary.
99
+ - `extraction-loop` — PRINCIPLE #1's own discipline (v0.29.0+). Opens when `docs/devlog.md`
100
+ has ≥3 dated entries and no `EXTR-NNN` extraction record exists. Conscience emits the
101
+ `capture` moment; founder runs `/extract` to record the routing decision (UP / DOWN /
102
+ NOT-YET). First hook-runner loop whose entry is *time-of-work* (devlog count) rather than
103
+ *file-state predicate* — sets the precedent for future judgment-required moments.
104
+ - `cost-review-loop` — the cadence half of cost discipline (v0.30.0+). Opens once
105
+ `docs/ai-cost-budget.md` exists and no cost-review file is on record. Conscience emits
106
+ the `cost-stale` moment; founder runs `/cost-review` to read the ledger. Second
107
+ time-of-work entry pattern (declaration → read sequence).
108
+ - `drift-loop` — the work vs. the named risk (v0.31.0+). Opens when a canvas has a real
109
+ **Riskiest assumption** + `docs/devlog.md` has ≥3 dated entries + no real **Experiment
110
+ this week** validation plan. Conscience emits the `drift` moment — the gap *between*
111
+ `caution` (no risk named) and `done` (graduation). The first moment fronting a *model
112
+ judgment the predicate can't make*: the gate is cheap (risk named + work piling up + no
113
+ plan), but the model reads a bounded set (risk line + ~5 recent devlog + open FEAT) and
114
+ judges whether the work is *testing* the risk or building *around* it — naming the specific
115
+ gap, staying silent when on-aim. Closes when the validation plan is recorded (`/canvas`)
116
+ or run (`/pretotype`).
117
+ - **Conventions:** `FEAT-NNN` for features in build (already listed in `docs/IDS.md`);
118
+ `docs/devlog.md` is append-only (override grammar lives here per IDEA-008);
119
+ `docs/RESUME.md` is the living state pointer; `docs/loops/` lives alongside `docs/ideas/`;
120
+ `docs/design/DESIGN_TOKENS.md` arrives JIT when design-tokens-loop opens.
121
+ - **Graduation:** when the app earns design-system rigor / a real db / prototypes / a board
122
+ → `boss unlock v1`.
@@ -0,0 +1,107 @@
1
+ ---
2
+ id: ai-failure-state-loop
3
+ type: loop
4
+ stage: L1-mvp
5
+ runner_type: hook
6
+ attributed_to: [Husain (failure-mode categorization), Liu (structured outputs make detection possible), Karpathy (the failure surface IS the design surface)]
7
+ also_relevant: [Mollick, Rauch, Willison]
8
+ entry:
9
+ - count_at_least:
10
+ path_glob: src/**
11
+ pattern: '(anthropic|@anthropic-ai/sdk|openai|OpenAI\(|Anthropic\(|messages\.create|chat\.completions\.create|generateText|streamText)'
12
+ min: 1
13
+ exit:
14
+ - exists: { path: docs/ai-failure-states.md }
15
+ - count_at_least:
16
+ path_glob: src/**
17
+ pattern: '(handleGarbageResponse|handleRefusal|handleHallucination|handleTimeout|handleCostSpike|ai-handlers|aiHandlers|on_refusal|on_hallucination|on_timeout)'
18
+ min: 1
19
+ drift_moment: failure-mode
20
+ ---
21
+
22
+ # Loop: ai-failure-state (MVP)
23
+
24
+ The discipline that **names the failure before the user finds it.** When code contains an LLM
25
+ SDK call but no `docs/ai-failure-states.md`, the founder is shipping a happy-path UI that will
26
+ meet the five guaranteed failure modes (garbage / refusal / hallucination / timeout / cost
27
+ spike) without a declared response. This loop opens at the first LLM call and closes when
28
+ both the design doc exists AND the code references at least one failure-handler.
29
+
30
+ `runner_type: hook` — the conscience evaluates this loop on every UserPromptSubmit. Entry
31
+ predicate requires ≥1 LLM SDK call site (same pattern as `cost-budget-loop` — they share the
32
+ inflection point: *the first time code touches the model*).
33
+
34
+ When open, the conscience surfaces the `failure-mode` moment: *"The code calls an LLM but no
35
+ failure-states doc exists. What does the UI do when the model returns garbage / refuses /
36
+ hallucinates / times out / costs too much? Run `/ai-failure-states` to declare it."*
37
+
38
+ ## Entry artifact
39
+
40
+ `src/` contains ≥1 LLM SDK call site (same regex as `cost-budget-loop`). The five failure
41
+ modes exist the moment the first call exists; the loop opens at parity with the cost loop.
42
+
43
+ ## Purpose
44
+
45
+ Force the failure-states design into the open before the user encounters them. The skill
46
+ (`/ai-failure-states`) walks the founder through:
47
+ - The five failure states + a project-specific concrete example for each
48
+ - The declared response for each (in code-level detail; not "handle gracefully")
49
+ - Stub fallback handlers in code (so the discipline is wired even if implementation is
50
+ incremental)
51
+ - Cohort-aware delivery (first-product gets named patterns; eng-builder gets lint-anchored
52
+ unhandled-path discipline; domain-expert gets the human-in-the-loop discipline for
53
+ high-stakes domains)
54
+
55
+ ## Exit artifact
56
+
57
+ `docs/ai-failure-states.md` exists AND `src/` contains ≥1 reference to a failure-handler
58
+ function name (the regex covers the canonical TypeScript names and, for refusal / hallucination / timeout, their snake_case Python
59
+ equivalents). The dual requirement matters: a design doc without code references is
60
+ declaration without contract; handlers without a doc are code without rationale.
61
+
62
+ ## Drift
63
+
64
+ Entry satisfied (≥1 LLM SDK call) AND exit not satisfied (no design doc OR no handler refs)
65
+ → loop is open → conscience emits `failure-mode` moment.
66
+
67
+ Confidence: low on first detection (one call site might be exploratory); medium with a
68
+ handful of calls; high when LLM calls touch user-visible code paths (heuristic: presence of
69
+ the regex AND presence of `export`, `route`, `app.`, `Router`, `handler` near the call sites).
70
+
71
+ The voice (cohort-aware via v0.20's framing): name the gap in one line, point at
72
+ `/ai-failure-states`, hand the decision back. Never blocks. Override recorded in devlog per
73
+ IDEA-008.
74
+
75
+ ## How to remix
76
+
77
+ - **Skip:** legitimate when the LLM use is genuinely throwaway (dev-only script, no
78
+ user-facing path). Override:
79
+ ```
80
+ - **OVERRIDE:** skipped `ai-failure-state-loop` — rationale: <e.g., LLM call is in a
81
+ dev-only script; no user-facing path; not deployed>.
82
+ ```
83
+ - **Swap discipline:** the five failure states are the floor, not the ceiling. Some projects
84
+ need rate-limit handling, model-deprecation discipline, regulatory-failure-mode design
85
+ (for `domain-expert` cohort projects). Add them in the doc; the loop's exit predicate just
86
+ needs *one* handler reference, so additional handlers are additive, not gated.
87
+ - **Author your own:** a domain-specific failure discipline (e.g., a
88
+ `multimodal-fallback-loop` for projects where image/audio generation has its own failure
89
+ shape; or a `tool-use-failure-loop` for agentic workflows where the model picks the wrong
90
+ tool).
91
+
92
+ ## When this loop re-opens
93
+
94
+ - Failure-states doc deleted → exit predicate fails again.
95
+ - All handler references removed (refactor that strips them out) → ditto.
96
+ - New LLM provider added that introduces a new failure surface (e.g., a multimodal API with
97
+ image-safety refusal patterns the existing doc didn't cover) → the founder should re-run
98
+ the skill to extend the doc; loop doesn't auto-detect this, but the founder should know.
99
+ - Real-production failure surprise → the failure-state doc was incomplete. Add the new mode
100
+ + handler; refresh the declared response.
101
+
102
+ ## Cite
103
+
104
+ Husain's *"categorize failures by mode — failure modes are more valuable than success
105
+ modes"* applied to UX, not just evals. Liu (structured outputs make garbage detection
106
+ mechanical, not vibes). Karpathy (the failure surface IS the design surface for AI-mediated
107
+ products — designing the happy path is the easy 20%).
@@ -0,0 +1,116 @@
1
+ ---
2
+ id: coordination-loop
3
+ type: loop
4
+ stage: L1-mvp
5
+ runner_type: hook
6
+ attributed_to: [Ajesh Shah (PRINCIPLES — the conscience holds the seam AI erodes)]
7
+ also_relevant: [Ju & Aral 2025 (human-AI teams' social/emotional comms drop ~27% while perceived quality stays flat — the invisible drift), Noam Wasserman (cofounder dynamics), Amy Edmondson (psychological safety)]
8
+ entry:
9
+ - any_file_matches:
10
+ path_glob: .boss/config.json
11
+ pattern: '"handle"'
12
+ - count_at_least:
13
+ path_glob: docs/devlog.md
14
+ pattern: '^## \d{4}-\d{2}-\d{2}'
15
+ min: 3
16
+ exit:
17
+ - count_at_least:
18
+ path_glob: docs/decisions/DEC-*.md
19
+ pattern: '^id:\s*DEC-\d'
20
+ min: 1
21
+ drift_moment: coordination
22
+ ---
23
+
24
+ # Loop: coordination (MVP) — the cofounder seam AI quietly erodes
25
+
26
+ This is the founder layer's conscience moment (IDEA-037 / FEAT-021 slice 5b). It exists because of the
27
+ single most-replicated finding in human-AI teaming: **AI accelerates each individual but does not hold the
28
+ team together** — and worse, the human-to-human seam erodes *invisibly* (Ju & Aral RCT, n=2,234:
29
+ social/emotional communication dropped ~27% while *perceived* teamwork quality stayed flat). A two-person
30
+ team can feel productive while quietly building in parallel, each in their own AI session, never actually
31
+ deciding anything *together*.
32
+
33
+ The shared decision log (`DEC-NNN`) is the artifact that makes that drift visible. **A team that has been
34
+ building for a while and has never recorded a single decision together** is the structural signal that work
35
+ may be flowing *through* each founder's agent and *around* the cofounder.
36
+
37
+ ## Dormant-solo by construction
38
+
39
+ The entry requires a cofounder on the roster (`.boss/config.json` carries a `"handle"`). A solo founder
40
+ never opens this loop — there is no seam to watch. It only lights up once `boss team add` has put a second
41
+ person on the venture. This is the same dormant-solo guarantee the rest of the founder layer holds.
42
+
43
+ ## The judgment the predicate can't make (and the model can)
44
+
45
+ The predicate gate is coarse: *it's a team · real work has happened (devlog ≥3) · zero `DEC` recorded.*
46
+ Regex can confirm those facts; it cannot tell whether the empty decision log means **the founders are
47
+ genuinely deciding in parallel and drifting apart** (the real seam problem) or **they decided everything on
48
+ a call and simply didn't write it down** (a quiet log that's perfectly healthy). That call is the model's,
49
+ and it's the whole value of the moment.
50
+
51
+ So the loop opens the door; the model walks through with judgment. When the door is open, the conscience
52
+ reads a bounded slice — the decisions directory (empty), the board (`boss board`), who's on the team
53
+ (`boss team`) — nothing wider. Then it judges: is one founder's solo-agent velocity high while the shared
54
+ log sits untouched by the other, or is this honest parallel work with the deciding happening off-repo? If
55
+ the former, name it in one spare line and ask the *coordination* question. If the latter, **stay silent** —
56
+ a quiet log is not proof of a problem (this is the weakest-transfer evidence in the research; over-firing
57
+ here would punish a team that simply talks on calls).
58
+
59
+ ## Entry artifact
60
+
61
+ **A team building without deciding together** — `.boss/config.json` has a collaborator (`"handle"`) AND
62
+ `docs/devlog.md` has ≥3 dated entries (real work has happened — same "enough has happened" gate the drift
63
+ and capture loops use). Below 3 entries it's too early; a brand-new team isn't nagged before there's work.
64
+
65
+ ## Exit artifact
66
+
67
+ **At least one shared decision recorded** — `docs/decisions/DEC-*.md` has ≥1 `DEC` (its `id:` frontmatter
68
+ matches). Recording a single decision together closes the loop: the founders have proven they *do* decide
69
+ jointly, and the never-decided-together smell no longer applies. The good outcome of a coordination nudge
70
+ is a `/decide`.
71
+
72
+ ## Drift
73
+
74
+ `entry: satisfied` (a team + ≥3 devlog entries) AND `exit: not satisfied` (no `DEC` yet) = loop OPEN →
75
+ conscience emits the `coordination` moment. The model composes the voice (per `boss-voice`: seasoned hand,
76
+ assume intelligence), reads only the bounded slice, and **judges before speaking** — never a "your teamwork
77
+ score is low" read, never a satisfaction prompt (self-report is *proven blind* to the drift). The value is
78
+ the specific, structural observation: *building a while, nothing decided together, are you two actually in
79
+ this jointly?* It serves the **partnership as the unit** and **never takes a side** between the cofounders —
80
+ it surfaces the seam; it never says whose fault it is. Pairs with `mentor-cofounder` for the deeper coaching.
81
+
82
+ ## Cost (BOSS eating its own dogfood)
83
+
84
+ - The **predicate gate is the cost control** — the model reads the bounded slice only after the cheap Node
85
+ checks confirm (team + work + no decision). On every other prompt the hook emits nothing.
86
+ - The read is **bounded** — decisions dir + board + team roster, never the whole project.
87
+ - The model fires **at most once per session** and stays silent when the parallel work is honest.
88
+
89
+ ## Known limitation (documented, like focus-loop's)
90
+
91
+ Exit is "≥1 `DEC` *ever*," so the moment targets the never-decided-together case and goes quiet after the
92
+ first shared decision — even if the founders later drift back into parallel solo work. A "decided together
93
+ *recently*" semantic would need a date-windowed count the predicate vocabulary doesn't have yet (the same
94
+ gap `building_since` aging fills on the board side). The evidence here is weak-transfer to begin with, so
95
+ the conservative once-and-quiet behavior is the right default; revisit if a real team shows the rebuilt-drift
96
+ case matters. **This is design-from-principle, labeled a bet** — there is no peer-reviewed study on
97
+ async multi-human/multi-AI founding teams; the loop watches the artifact channel the research points at,
98
+ and fires conservatively.
99
+
100
+ ## How to remix
101
+
102
+ - **Skip / override:** legitimate when the deciding genuinely happens off-repo (a distributed pair who talk
103
+ daily and just don't write `DEC`s). Override grammar:
104
+ ```
105
+ - **OVERRIDE:** skipped `coordination-loop` — rationale: <e.g. we decide on our daily call; the log lags
106
+ on purpose, we're not drifting>.
107
+ ```
108
+ Or mute it outright: `boss conscience mute coordination`.
109
+ - **Tune the threshold:** a team that writes decisions rarely by preference might raise the devlog gate.
110
+
111
+ ## Cite
112
+
113
+ PRINCIPLES.md (the conscience holds the seam). Ju & Aral 2025 (invisible human-AI team drift — the
114
+ motivating evidence), Wasserman (cofounder dynamics), Edmondson (psychological safety). The loop is the
115
+ *when*; the model's judgment + `boss board` + `/decide` (record one together) + `mentor-cofounder` are the
116
+ *how*. Research design input: `docs/research/IDEA-037-founding-teams-ai-design-input.md`.
@@ -0,0 +1,117 @@
1
+ ---
2
+ id: cost-budget-loop
3
+ type: loop
4
+ stage: L1-mvp
5
+ runner_type: hook
6
+ attributed_to: [Husain (look-at-your-data, applied to spend), Liu (structured outputs as cost lever), Mollick (cost-as-design-input)]
7
+ also_relevant: [Karpathy, Willison, Rauch]
8
+ entry:
9
+ - count_at_least:
10
+ path_glob: src/**
11
+ pattern: '(anthropic|@anthropic-ai/sdk|openai|OpenAI\(|Anthropic\(|messages\.create|chat\.completions\.create|generateText|streamText)'
12
+ min: 1
13
+ exit:
14
+ - exists: { path: docs/ai-cost-budget.md }
15
+ - count_at_least:
16
+ path_glob: src/**
17
+ pattern: '(logCall|log_call|ai-cost-logger|ai_cost_logger|costLogger)'
18
+ min: 1
19
+ drift_moment: cost
20
+ ---
21
+
22
+ # Loop: cost-budget (MVP)
23
+
24
+ The discipline that **names the bill before the bill names you.** When code contains an LLM SDK
25
+ call but no `docs/ai-cost-budget.md`, the founder is operating without unit economics for the
26
+ single most load-bearing operating cost of an AI-native app. This loop opens the moment that
27
+ gap exists, and closes when the budget is declared AND a cost logger is in the codebase.
28
+
29
+ `runner_type: hook` — the conscience evaluates this loop on every UserPromptSubmit. The entry
30
+ predicate requires at least one LLM SDK call site, so it **doesn't fire on fresh projects** —
31
+ it opens only when the founder has actually reached for an LLM in code. This is the *"the app
32
+ is now AI-mediated"* inflection (IDEA-012's universal-cohort feature).
33
+
34
+ When open, the conscience surfaces the `cost` moment: *"You're calling an LLM in code without
35
+ a declared budget. Run `/ai-cost` to name what you'll spend per user, which models you chose,
36
+ and how you'll see the bill before it surprises you."*
37
+
38
+ ## Entry artifact
39
+
40
+ `src/` contains ≥1 LLM SDK call site — regex covers the common patterns:
41
+ - `anthropic`, `@anthropic-ai/sdk`, `Anthropic(`
42
+ - `openai`, `OpenAI(`
43
+ - `messages.create`, `chat.completions.create`
44
+ - Vercel AI SDK: `generateText`, `streamText`
45
+
46
+ Threshold of 1 (not 3 like design-tokens-loop) because cost discipline is *deontic at the
47
+ first call* — there's no "exploratory" version of token spend. The first call hits a real
48
+ billing meter.
49
+
50
+ Stack misses: founders on stacks the regex doesn't catch (LangChain wrappers, Replicate, Cohere,
51
+ Bedrock client libs, etc.) can either edit this loop's entry pattern, or simply run `/ai-cost`
52
+ manually (the loop respects override).
53
+
54
+ ## Purpose
55
+
56
+ Force the cost shape into the open before it accumulates. The skill (`/ai-cost`) walks the
57
+ founder through:
58
+ - Cohort-aware budget defaults (first-product gets a tight cap; vibe-virtuoso gets inspect-only;
59
+ domain-expert gets privacy-first logging)
60
+ - Per-call-site model choice with explicit *why*
61
+ - A wrapper logger so every call writes to `.boss/cost-log.jsonl`
62
+ - Review cadence + breach grammar
63
+ - Optional handoff to `mentor-architect` (cost shape → architecture) or `mentor-business`
64
+ (cost-per-user → pricing)
65
+
66
+ ## Exit artifact
67
+
68
+ `docs/ai-cost-budget.md` exists AND `src/` contains ≥1 reference to a cost-logging wrapper
69
+ (`logCall`, `log_call`, `ai-cost-logger`, `ai_cost_logger`, `costLogger`). The dual requirement
70
+ matters: a budget doc without instrumented code is theater (you wrote intent; you didn't wire
71
+ sight). Instrumentation without a budget is observability without a target.
72
+
73
+ ## Drift
74
+
75
+ Entry satisfied (≥1 LLM SDK call) AND exit not satisfied (no budget file OR no logger refs)
76
+ → loop is open → conscience emits `cost` moment.
77
+
78
+ Confidence is medium on first detection (a lone call site may be a throwaway dev script) and high once
79
+ there are multiple call sites — the math compounds with each user-facing path.
80
+
81
+ The voice (cohort-aware via v0.20's framing): name the gap in one line, point at `/ai-cost`,
82
+ hand the decision back. Never blocks. Override is recorded in devlog per IDEA-008.
83
+
84
+ ## How to remix
85
+
86
+ - **Skip:** legitimate when the project's LLM use is genuinely throwaway (a one-off dev
87
+ script that won't reach users). Override:
88
+ ```
89
+ - **OVERRIDE:** skipped `cost-budget-loop` — rationale: <e.g., this LLM call is in a
90
+ dev-only script; not in user-facing code; not deployed>.
91
+ ```
92
+ - **Swap discipline:** Husain's eval-driven cost reduction vs. caching-first (Anthropic prompt
93
+ caching as default) vs. batch-first (non-realtime workloads) vs. tiered-model (Haiku as
94
+ default, escalate to Sonnet only when needed). The loop's exit predicate checks for *some*
95
+ cost-logger in code, not a specific tool; the budget doc is where the founder records which
96
+ discipline they're applying.
97
+ - **Author your own:** a domain-specific cost discipline (e.g., a `gpu-spend-loop` for a
98
+ fine-tuning project where the load-bearing cost is training, not inference; or a
99
+ `vector-store-cost-loop` for a RAG-heavy project where embeddings + storage dominate).
100
+
101
+ ## When this loop re-opens
102
+
103
+ - Cost-logger wrapper deleted or renamed → exit predicate fails again.
104
+ - Budget doc deleted → ditto.
105
+ - New LLM SDK added (e.g., founder integrates a second provider) → entry may grow; check the
106
+ budget doc covers it; if not, treat as drift.
107
+ - Model swap on a load-bearing call site → the budget doc's model rows are stale; re-run
108
+ `/ai-cost` to refresh.
109
+ - Real-bill surprise → the bill IS the design signal. Add the new failure mode to the budget
110
+ doc; refine the levers.
111
+
112
+ ## Cite
113
+
114
+ Husain's *"almost all AI quality problems are visible in the data, and almost no one looks"*
115
+ applied to spend: *almost all AI cost problems are visible in the ledger, and almost no one
116
+ keeps one.* Liu (structured outputs as a cost lever — smaller schemas, smaller bills).
117
+ Mollick (cost-as-design-input: the bill shapes the product, not the other way around).
@@ -0,0 +1,113 @@
1
+ ---
2
+ id: cost-review-loop
3
+ type: loop
4
+ stage: L1-mvp
5
+ runner_type: hook
6
+ attributed_to: [Husain (the cadence the eval-quality loop demands), Ajesh Shah (PRINCIPLE #1 applied to spend — the discipline only works when both halves run)]
7
+ also_relevant: [Liu (structured outputs make cost-attribution mechanical), Maurya (running-the-business cadence)]
8
+ entry:
9
+ - exists: { path: docs/ai-cost-budget.md }
10
+ exit:
11
+ - count_at_least:
12
+ path_glob: docs/cost-reviews/REVIEW-*.md
13
+ pattern: '^- \*\*Total spend:\*\*'
14
+ min: 1
15
+ drift_moment: cost-stale
16
+ ---
17
+
18
+ # Loop: cost-review (MVP) — the cadence the budget doc declared
19
+
20
+ > The v0.25 audit named this as a discipline hole: *"`/ai-cost`'s weekly review cadence is
21
+ > declared in `docs/ai-cost-budget.md` but no skill reads `.boss/cost-log.jsonl`. The cadence
22
+ > is unenforced."* v0.30 closes that. `/ai-cost` declares the budget; `/cost-review` reads the
23
+ > ledger; both halves required.
24
+
25
+ The loop opens once the founder has run `/ai-cost` and a budget doc exists. Until at least
26
+ one cost-review file is recorded, the loop emits the `cost-stale` moment — *"you declared
27
+ the budget; you haven't looked at the ledger yet."* The first review closes the loop.
28
+
29
+ This is the **second time-of-work entry pattern** (after extraction-loop in v0.29). Entry is
30
+ *budget declared* (a deontic gate — once you commit to budget discipline you also commit to
31
+ reading the ledger); exit is *first review recorded*. Future versions may add recurring re-opening when
32
+ predicates gain time-awareness.
33
+
34
+ ## Entry artifact
35
+
36
+ `docs/ai-cost-budget.md` exists. This is the budget-doc half of cost-budget-loop's exit predicate
36
+ (the logger check is not repeated here), shifted in role: once the budget is declared, the
37
+ cost-review-loop opens (now read what you wired). The two loops form a sequence: declare →
38
+ read → declare-adjustments → read-again.
40
+
41
+ ## Purpose
42
+
43
+ Force the reading half of the discipline. Most projects ship `/ai-cost` and then never look
44
+ at the ledger; the budget becomes aspirational. This loop closes the gap between *"we
45
+ declared"* and *"we know."* The skill `/cost-review` is the actual read; the loop opens the
46
+ door at the first reasonable moment (immediately after the budget is declared).
47
+
48
+ ## Exit artifact
49
+
50
+ ≥1 file matching `docs/cost-reviews/REVIEW-*.md` containing a `^- \*\*Total spend:\*\*` line.
51
+ The `/cost-review` skill writes these. The presence of *any* review with real numbers closes
52
+ the loop; the discipline IS the practice of reading, not the volume of reviews.
53
+
54
+ To keep a weekly cadence going (only the first option actually re-opens the loop), the founder can either:
55
+ - Rename or archive old reviews into a sub-directory (`docs/cost-reviews/archive/`) — old
56
+ files no longer match the glob; loop re-opens.
57
+ - Or simply re-run `/cost-review` weekly; the existence of a *newer* review doesn't change
58
+ the loop state today (predicate vocabulary doesn't compare file ages — same gap that
59
+ extraction-loop hit; same trade-off).
60
+
61
+ For v0.30, the loop is the **first-review inflection.** Future versions may add time-aware
62
+ predicates (*"latest review file's mtime is older than 7 days"*) to enforce recurring cadence.
63
+
64
+ ## Drift
65
+
66
+ Entry satisfied (budget doc exists) AND exit not satisfied (no review file) → loop is open →
67
+ conscience emits `cost-stale` moment.
68
+
69
+ Confidence: low immediately after the budget is declared (the founder might still be
70
+ mid-session running other discipline skills; ledger may be empty); medium if the budget is
71
+ older than a session or two; high if real spend has accumulated in the ledger without a
72
+ review on record.
73
+
74
+ The voice (cohort-aware via v0.20's framing): name the unread-ledger gap in one line, point
75
+ at `/cost-review`, hand the decision back. Don't sound like a productivity-reward;
76
+ *"you declared a budget — worth looking at what actually happened?"* lands better than
77
+ *"great, time to review!"*
78
+
79
+ ## How to remix
80
+
81
+ - **Skip:** legitimate when the ledger is genuinely empty (no LLM calls yet — though usually
82
+ in that case `cost-budget-loop` is still open and this one shouldn't fire yet) OR when the
83
+ founder is in a single concentrated build session and reviewing-then-restarting would just
84
+ fragment attention. Override:
85
+ ```
86
+ - **OVERRIDE:** skipped `cost-review-loop` — rationale: <e.g., ledger has < 50 entries;
87
+ not yet enough signal to surface; re-check after FEAT-NNN ships>.
88
+ ```
89
+ - **Swap discipline:** Maurya's running-the-business weekly review vs. Husain's eval-quality
90
+ loop (review the eval set + costs together) vs. Liu's structured-outputs lens (track
91
+ cost-per-structured-schema-shape, not just per-call). Loop's exit predicate just needs
92
+ *some* review with real numbers; the discipline framing is the founder's.
93
+ - **Author your own:** a complementary review loop (e.g., `eval-review-loop` for re-running
94
+ eval sets weekly; `failure-state-review-loop` for confirming declared handlers actually
95
+ caught the recurring failures). Same time-of-work pattern.
96
+
97
+ ## When this loop re-opens (today: requires founder action)
98
+
99
+ v0.30 ships the first-review version. Once a review file exists, the loop stays closed —
100
+ even if the review is stale (older than the cadence the doc claims). The recurring version
101
+ is gated on the predicate vocabulary gaining time-awareness; same dependency as
102
+ extraction-loop.
103
+
104
+ For now: founders running the weekly cadence either re-run the skill or rotate old reviews
105
+ into an archive directory. The conscience doesn't auto-nudge after the first read.
106
+
107
+ ## Cite
108
+
109
+ Husain (look-at-your-data, applied to spend not just quality — *"almost all AI cost problems
110
+ are visible in the ledger, and almost no one keeps one OR reads it"*). Ajesh Shah (PRINCIPLE
111
+ #1 applied recursively: declaring is half; reading is the other half; both required).
112
+ Maurya (running-the-business cadence — the review IS the running). Liu (structured outputs
113
+ make per-call cost-attribution mechanical, not vibes).