bossbuild 0.97.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/PRINCIPLES.md +70 -0
  3. package/README.md +213 -0
  4. package/VERSION +1 -0
  5. package/bin/boss +3 -0
  6. package/library/README.md +19 -0
  7. package/library/agents/.gitkeep +0 -0
  8. package/library/agents/mentor-venture.md +57 -0
  9. package/library/hooks/.gitkeep +0 -0
  10. package/library/hooks/auto-log.js +133 -0
  11. package/library/hooks/memory-cue.js +82 -0
  12. package/library/hooks/secrets-guard.js +87 -0
  13. package/library/memory-seed/README.md +29 -0
  14. package/library/memory-seed/durable-facts-example.md +16 -0
  15. package/library/practices/.gitkeep +0 -0
  16. package/library/practices/agent-security.md +111 -0
  17. package/library/practices/ai-adoption-culture.md +104 -0
  18. package/library/practices/ai-ux-patterns.md +246 -0
  19. package/library/practices/celebration-of-done.md +100 -0
  20. package/library/practices/conscience-voicing.md +121 -0
  21. package/library/practices/context-discipline.md +116 -0
  22. package/library/practices/design-system.md +152 -0
  23. package/library/practices/git-workflow.md +119 -0
  24. package/library/practices/harm-taxonomy.md +45 -0
  25. package/library/practices/quality-ratchet.md +48 -0
  26. package/library/practices/revalidation.md +57 -0
  27. package/library/practices/scalable-architecture.md +111 -0
  28. package/library/practices/ship-it-live.md +149 -0
  29. package/library/practices/skill-authoring.md +70 -0
  30. package/library/skills/.gitkeep +0 -0
  31. package/library/skills/boss-learn/SKILL.md +63 -0
  32. package/library/skills/boss-sync/SKILL.md +48 -0
  33. package/package.json +49 -0
  34. package/registry/CHANGELOG.md +2737 -0
  35. package/src/board.js +655 -0
  36. package/src/brain.js +288 -0
  37. package/src/cli.js +542 -0
  38. package/src/conscience.js +426 -0
  39. package/src/insights.js +147 -0
  40. package/src/learn.js +92 -0
  41. package/src/map.js +103 -0
  42. package/src/modes.js +82 -0
  43. package/src/paths.js +36 -0
  44. package/src/registry.js +34 -0
  45. package/src/scaffold.js +138 -0
  46. package/src/sync.js +292 -0
  47. package/src/team.js +103 -0
  48. package/stages/L0-quickstart/manifest.json +12 -0
  49. package/stages/L0-quickstart/template/.claude/agents/coder-generalist.md +31 -0
  50. package/stages/L0-quickstart/template/.claude/agents/mentor-venture.md +57 -0
  51. package/stages/L0-quickstart/template/.claude/agents/pm.md +28 -0
  52. package/stages/L0-quickstart/template/.claude/hooks/conscience.js +89 -0
  53. package/stages/L0-quickstart/template/.claude/hooks/lib/loop-runtime.js +507 -0
  54. package/stages/L0-quickstart/template/.claude/hooks/lib/yaml.js +163 -0
  55. package/stages/L0-quickstart/template/.claude/hooks/memory-cue.js +82 -0
  56. package/stages/L0-quickstart/template/.claude/hooks/secrets-guard.js +87 -0
  57. package/stages/L0-quickstart/template/.claude/rules/your-app-code.md +17 -0
  58. package/stages/L0-quickstart/template/.claude/settings.json +36 -0
  59. package/stages/L0-quickstart/template/.claude/skills/boss/SKILL.md +161 -0
  60. package/stages/L0-quickstart/template/.claude/skills/boss-learn/SKILL.md +63 -0
  61. package/stages/L0-quickstart/template/.claude/skills/boss-sync/SKILL.md +55 -0
  62. package/stages/L0-quickstart/template/.claude/skills/canvas/SKILL.md +112 -0
  63. package/stages/L0-quickstart/template/.claude/skills/comprehend/SKILL.md +72 -0
  64. package/stages/L0-quickstart/template/.claude/skills/decide/SKILL.md +122 -0
  65. package/stages/L0-quickstart/template/.claude/skills/feedback/SKILL.md +68 -0
  66. package/stages/L0-quickstart/template/.claude/skills/import/SKILL.md +73 -0
  67. package/stages/L0-quickstart/template/.claude/skills/persona/SKILL.md +92 -0
  68. package/stages/L0-quickstart/template/.claude/skills/prototype/SKILL.md +114 -0
  69. package/stages/L0-quickstart/template/.claude/skills/triage/SKILL.md +104 -0
  70. package/stages/L0-quickstart/template/.claude/skills/welcome/SKILL.md +262 -0
  71. package/stages/L0-quickstart/template/AGENTS.md +31 -0
  72. package/stages/L0-quickstart/template/CLAUDE.md +57 -0
  73. package/stages/L0-quickstart/template/docs/IDS.md +42 -0
  74. package/stages/L0-quickstart/template/docs/ideas/INDEX.md +24 -0
  75. package/stages/L0-quickstart/template/docs/loops/canvas-loop.md +90 -0
  76. package/stages/L0-quickstart/template/docs/loops/capture-loop.md +64 -0
  77. package/stages/L1-mvp/manifest.json +12 -0
  78. package/stages/L1-mvp/template/.claude/agents/mentor-architect.md +124 -0
  79. package/stages/L1-mvp/template/.claude/agents/mentor-cofounder.md +85 -0
  80. package/stages/L1-mvp/template/.claude/agents/mentor-gtm.md +49 -0
  81. package/stages/L1-mvp/template/.claude/agents/program-manager.md +46 -0
  82. package/stages/L1-mvp/template/.claude/agents/tester.md +42 -0
  83. package/stages/L1-mvp/template/.claude/hooks/auto-log.js +133 -0
  84. package/stages/L1-mvp/template/.claude/rules/feature-context.md +18 -0
  85. package/stages/L1-mvp/template/.claude/skills/ai-cost/SKILL.md +249 -0
  86. package/stages/L1-mvp/template/.claude/skills/ai-failure-states/SKILL.md +226 -0
  87. package/stages/L1-mvp/template/.claude/skills/ai-first-init/SKILL.md +227 -0
  88. package/stages/L1-mvp/template/.claude/skills/close/SKILL.md +170 -0
  89. package/stages/L1-mvp/template/.claude/skills/consult/SKILL.md +72 -0
  90. package/stages/L1-mvp/template/.claude/skills/cost-review/SKILL.md +204 -0
  91. package/stages/L1-mvp/template/.claude/skills/design-tokens-init/SKILL.md +192 -0
  92. package/stages/L1-mvp/template/.claude/skills/drift-deep/SKILL.md +170 -0
  93. package/stages/L1-mvp/template/.claude/skills/evals/SKILL.md +154 -0
  94. package/stages/L1-mvp/template/.claude/skills/extract/SKILL.md +209 -0
  95. package/stages/L1-mvp/template/.claude/skills/judge-traces/SKILL.md +68 -0
  96. package/stages/L1-mvp/template/.claude/skills/log/SKILL.md +64 -0
  97. package/stages/L1-mvp/template/.claude/skills/practice/SKILL.md +92 -0
  98. package/stages/L1-mvp/template/.claude/skills/pretotype/SKILL.md +95 -0
  99. package/stages/L1-mvp/template/.claude/skills/red-team/SKILL.md +137 -0
  100. package/stages/L1-mvp/template/.claude/skills/revalidate/SKILL.md +51 -0
  101. package/stages/L1-mvp/template/.claude/skills/ship/SKILL.md +105 -0
  102. package/stages/L1-mvp/template/.claude/skills/smoke/SKILL.md +43 -0
  103. package/stages/L1-mvp/template/.claude/skills/spec/SKILL.md +145 -0
  104. package/stages/L1-mvp/template/claude-append.md +122 -0
  105. package/stages/L1-mvp/template/docs/loops/ai-failure-state-loop.md +107 -0
  106. package/stages/L1-mvp/template/docs/loops/coordination-loop.md +116 -0
  107. package/stages/L1-mvp/template/docs/loops/cost-budget-loop.md +117 -0
  108. package/stages/L1-mvp/template/docs/loops/cost-review-loop.md +113 -0
  109. package/stages/L1-mvp/template/docs/loops/design-tokens-loop.md +98 -0
  110. package/stages/L1-mvp/template/docs/loops/drift-loop.md +149 -0
  111. package/stages/L1-mvp/template/docs/loops/extraction-loop.md +128 -0
  112. package/stages/L1-mvp/template/docs/loops/focus-loop.md +106 -0
  113. package/stages/L1-mvp/template/docs/loops/pretotype-loop.md +88 -0
  114. package/stages/L1-mvp/template/docs/loops/spec-loop.md +83 -0
  115. package/stages/L2-v1/manifest.json +12 -0
  116. package/stages/L2-v1/template/.claude/agents/db-architect.md +91 -0
  117. package/stages/L2-v1/template/.claude/agents/mentor-business.md +124 -0
  118. package/stages/L2-v1/template/.claude/agents/mentor-fundraising.md +72 -0
  119. package/stages/L2-v1/template/.claude/agents/mentor-pitch.md +84 -0
  120. package/stages/L2-v1/template/.claude/agents/mentor-talent.md +84 -0
  121. package/stages/L2-v1/template/.claude/agents/ui-designer.md +81 -0
  122. package/stages/L2-v1/template/.claude/agents/ux-designer.md +87 -0
  123. package/stages/L2-v1/template/.claude/skills/board/SKILL.md +98 -0
  124. package/stages/L2-v1/template/.claude/skills/design-review/SKILL.md +77 -0
  125. package/stages/L2-v1/template/.claude/skills/ux-check/SKILL.md +93 -0
  126. package/stages/L2-v1/template/claude-append.md +59 -0
  127. package/stages/L2-v1/template/docs/loops/design-drift-loop.md +108 -0
  128. package/stages/L3-scale/README.md +13 -0
@@ -0,0 +1,154 @@
1
+ ---
2
+ name: evals
3
+ description: Build and run the eval set for an AI-mediated FEAT — "is it correct?" paired with /smoke's "is it alive?" Husain's discipline applied to LLM-mediated control-flow in {{PROJECT_NAME}} — look at your data, build the eval set FIRST, categorize failures by mode, vibes-based eval is only a starting point. Usage - /evals [FEAT-NNN | --new <feat>]
4
+ ---
5
+
6
+ # /evals — the AI-correctness gate
7
+
8
+ `/smoke` answers *is the app alive?* `/evals` answers *is the AI part correct?* In MVP mode, every
9
+ FEAT that puts an LLM in the control flow needs an eval set — even a tiny one. The discipline is
10
+ **Husain's**: *almost all AI quality problems are visible in the data, and almost no one looks.*
11
+ This skill is "look."
12
+
13
+ The MVP rule: **20 examples beats 0.** Even an unsophisticated eval set catches problems earlier than
14
+ no eval set ever could. Categorize failures so the next iteration can target a category, not just
15
+ "make it better."
16
+
17
+ ## Correctness ≠ safety — the adversarial half
18
+
19
+ A clean `/evals` pass means the AI part is *correct* — not that it's *safe*. Safety holds under normal
20
+ use and **degrades under adversarial / jailbreak prompts across every model** (Stanford HAI AI Index
21
+ 2026: 362 AI incidents in 2025, up from 233; the industry reports capability benchmarks and almost never
22
+ the responsibility half). So an `/evals` run isn't *done* until **`/red-team`** has run the adversarial
23
+ half — its OWASP-Agentic battery is what that pass should probe. Correctness is `/evals`; resistance
24
+ under attack is `/red-team`; shipping an AI feature needs **both**, not either alone.
25
+
26
+ ## When to run it
27
+
28
+ - A FEAT-NNN involves an LLM call whose output drives behaviour (not just user-facing prose).
29
+ - *Before* you ship the FEAT — eval-set-FIRST, not retrofitted.
30
+ - After a real-user bug report — add the failing case to the eval set, then fix.
31
+ - Regression — after any model swap, prompt change, or upstream library bump, re-run the set.
32
+
33
+ ## How to run it
34
+
35
+ 1. **Pick the FEAT.** Read the FEAT spec's *Evals* section. If it doesn't exist (FEAT pre-dates the
36
+ v0.21.0 spec template), add it: declare an `Eval set path: docs/evals/FEAT-NNN.yml`.
37
+ 2. **Create or open** `docs/evals/FEAT-NNN.yml`. Format: a YAML list of cases, each with
38
+ `id / category / scenario / inputs / expected / why`. Categorize `should-pass` and
39
+ `should-fail` (the latter sub-categorized by failure mode per Husain's discipline —
40
+ e.g. `over-applies / hallucinates / refuses-wrongly / format-violation`; AI-mediated FEATs use the canonical `failure_mode` names listed below).
41
+ 3. **Run them.** Either:
42
+ - A runner script (Node, ~150 lines like BOSS's own
43
+ `docs/architecture/conscience-evals/runner.js`) — preferred, machine-runnable, becomes a CI gate
44
+ - Or manually walk each case, recording pass/fail and what failed
45
+ 4. **Report.** One line per case: `✓ <id> <scenario>` or `✗ <id> → <reason>`. Summary table by
46
+ failure category — the categorized failure count IS the design signal for next iteration.
47
+ 5. **Add the failure** when something breaks live. Each real-world bug becomes a case in the set;
48
+ the set grows from real friction.
49
+
50
+ ## Eval set format (recommended)
51
+
52
+ ```yaml
53
+ - id: feat-007-pass-001
54
+ category: should-pass
55
+ scenario: <one-line natural description>
56
+ inputs:
57
+ <synthetic state / prompt / context>
58
+ expected:
59
+ <pass/fail criteria — what "right" looks like>
60
+ why: <why this case matters>
61
+
62
+ - id: feat-007-fail-001
63
+ category: should-fail
64
+ failure_mode: hallucination # one of: garbage | refusal | hallucination | timeout | cost-spike | other
65
+ scenario: <one-line>
66
+ inputs: …
67
+ expected:
68
+ fires: true # the detector / guard / refusal catches this
69
+ why: <what could go wrong if missed>
70
+ ```
71
+
72
+ Same shape as BOSS's own conscience-evals — copy that runner.js as a starting point if Node fits
73
+ your stack.
74
+
75
+ ## Failure-state coverage requirement (v0.30.0+ — for AI-mediated FEATs)
76
+
77
+ For any FEAT that puts an LLM in the user-visible path (i.e., one that declares responses in
78
+ `docs/ai-failure-states.md`), the eval set **must include at least one `should-fail` case
79
+ per declared failure state**, categorized by `failure_mode` matching the canonical names:
80
+
81
+ - `garbage` — schema-invalid / malformed / off-topic output
82
+ - `refusal` — safety-template / over-cautious non-answer / "I can't help with that"
83
+ - `hallucination` — confidently-wrong content (fabricated citations, invented APIs, etc.)
84
+ - `timeout` — call hangs / network drops / 5xx response
85
+ - `cost-spike` — input or output token count exceeds declared per-call cap
86
+
87
+ This closes the *"failure-state handlers can be stubs forever"* loophole. The handler stubs
88
+ in `src/` satisfy the `ai-failure-state-loop` predicate; the eval cases here verify the stubs
89
+ *actually do something* when the failure surfaces.
90
+
91
+ A handler whose `Eval-tested` field in `docs/ai-failure-states.md` reads `STUB` means the
92
+ founder has committed to writing this eval case — either now, or under a recorded override
93
+ (IDEA-008) with a re-open condition. **STUB without an override is the failure mode this
94
+ upgrade prevents.**
95
+
96
+ Cohort-aware: `first-product` may legitimately ship STUB + override on day-one builds
97
+ (*"haven't seen this failure yet; will write the case when FEAT-002 ships"*); `domain-expert`
98
+ in high-stakes domains should not ship STUB without an external escalation route documented.
99
+
100
+ ## Sharpening (2026 — Hamel Husain / Shreya Shankar)
101
+
102
+ The 2026 update to the eval discipline, from the people who teach it. Fold these in:
103
+
104
+ - **Error analysis comes first — on *real* traces, not invented cases.** Read your actual
105
+ session/agent traces, sort the failures into a taxonomy, *then* build evaluators for the modes you
106
+ actually see. Inventing eval cases before you've looked at real failures is "eval-driven
107
+ development" done backwards. (If the project runs BOSS's `auto-log` trace substrate, `.boss/trace.jsonl`
108
+ is exactly this raw material — IDEA-025.) Error analysis is 60–80% of the work.
109
+ - **Binary pass/fail, not 1–5 scores.** A Likert score hides the decision. Force each case to a
110
+ yes/no — "did it do the thing or not" — and let the *categorized* failures carry the nuance. Scores
111
+ feel rigorous and measure nothing.
112
+ - **One expert, not a committee.** Pick a single "benevolent dictator" who owns what pass means for
113
+ this FEAT. Averaging three people's vibes produces mush. (For high-stakes domains, that expert is a
114
+ domain expert — see the rule below.)
115
+ - **Don't let the model grade its own homework.** If you use LLM-as-judge, the judge must be a
116
+ *separate* pass from the call that produced the output, with its own examples of the judge being
117
+ wrong. A right answer reached through a bad/dangerous tool path is still a failure — evaluate the
118
+ *trajectory*, not just the endpoint. (For a FEAT with a real multi-step tool flow, assert on the
119
+ tool *sequence*, not only the final output; **UK AISI's Inspect** is the graduation-grade harness to
120
+ reach for if you outgrow hand-rolled trajectory checks — point at it, don't rebuild it.)
121
+ - **Cost hierarchy.** Cheap deterministic assertions first; reserve LLM-as-judge for the persistent,
122
+ genuinely-semantic failures. A useful mix to aim for: ~60% deterministic / ~30% LLM-as-judge /
123
+ ~10% human-in-the-loop.
124
+ - **Reliability ≠ single-shot — measure pass^k.** Non-deterministic output means a case that passes
125
+ once can fail the next call. Run each load-bearing case **k times and count how often *all* k runs
126
+ succeed** (pass^k, from τ-bench) — that consistency rate, not one green run, is the real reliability
127
+ signal. Zero-dependency: a loop around the case you already wrote. (τ-bench / UK AISI Inspect are the
128
+ graduate-grade harnesses to point at when you outgrow it, never a CLI dependency.)
129
+
130
+ ## Structured outputs (Liu discipline) — strongly recommended
131
+
132
+ If the LLM call's output drives subsequent code (control flow, data routing, decisions), **schema
133
+ the output**. Pydantic-first / Zod-first / any-structured-output. Free-form prose is for human
134
+ eyeballs only. The eval set then asserts on the *schema*, not on prose interpretation. This single
135
+ practice eliminates ~80% of LLM-pipeline brittleness (Jason Liu).
136
+
137
+ ## Rules
138
+
139
+ - **Eval-set first.** Write 20+ cases before the LLM call ships, not retrofitted after. If you
140
+ don't know what 20 cases are, you don't yet know what the FEAT does.
141
+ - **Categorize failures.** Failure-mode categorization is more valuable than success count.
142
+ Husain: *failure modes are more valuable than success modes — categorize systematically.*
143
+ - **Vibes are a starting point.** First 5 iterations on vibes is fine; the 6th wants rubrics.
144
+ - **Look at your data.** When a case fails, *open the actual model output side-by-side with the
145
+ prompt*. The fault is almost always visible.
146
+ - **LLM-as-judge is fine but only with examples of the judge being wrong.** Without that,
147
+ you've added a confidently-wrong layer on top of a confidently-wrong layer.
148
+ - **Domain expertise beats LLM eval** when stakes are real (medical, legal, financial). A
149
+ doctor reviewing 30 outputs beats GPT reviewing 3,000 in the same domain.
150
+ - **Eval is not CI — yet.** Run on commit when the cost is small. Run on every commit when the
151
+ cost is justified. Decide deliberately.
152
+ - **Correctness isn't safety.** A green `/evals` pass isn't *done* until `/red-team`'s adversarial pass
153
+ runs — safety degrades under attack even when correctness holds (AI Index 2026). Two rails, both required.
154
+ - **Non-deterministic ≠ run-once.** Measure consistency across k trials (pass^k), not a single happy-path run.
@@ -0,0 +1,209 @@
1
+ ---
2
+ name: extract
3
+ description: Pause and sort patterns — PRINCIPLE #1's discipline as a skill. Reads recent work (git log, devlog, src/, library/) and proposes 1-3 candidate extractions, each routed UP (into BOSS's library/<cat>/ for reuse across projects via /boss-learn) or DOWN (into the app's own core code). Records the decision in docs/extractions/EXTR-NNN-*.md. The LLM-as-judge counterpart to the predicate-based loops — uses Claude's reading, not regex. Usage - /extract
4
+ ---
5
+
6
+ # /extract — pause and sort the pattern
7
+
8
+ > *"BOSS is always scaffolding, but scaffolding is the **motion**, not the goal. At every
9
+ > natural breakpoint — a mode transition, a shipped feature, the third time the same work
10
+ > repeats — **pause and sort the pattern two ways:** UP into BOSS as a reusable superset
11
+ > practice. DOWN into the app as core functionality."* — PRINCIPLE #1.
12
+
13
+ The conscience's other moments fire on predicate matches (regex over files). This one needs
14
+ **judgment** — *"is this pattern actually reusable?"* — which a regex can't see. /extract is
15
+ the LLM-as-judge counterpart: Claude reads recent work and proposes specific routes.
16
+
17
+ The discipline says **two destinations**, not one:
18
+ - **UP** → `library/<category>/` in the BOSS source repo. Every future project inherits it.
19
+ Promoted via `boss learn <path> --as <cat>`. *"This pattern is reusable across projects."*
20
+ - **DOWN** → into the app's core code (refactor inline duplication into a named module,
21
+ function, or schema in `src/`). *"This pattern is product, not scaffold."*
22
+
23
+ A pattern can also be **neither yet** — the third honest answer. Recording the *not yet* IS
24
+ the discipline; pretending everything is extractable is the failure mode.
25
+
26
+ ## When to run it
27
+
28
+ - The conscience surfaced the `capture` moment (extraction-loop opened — heuristic says ≥3
29
+ devlog entries, no extraction recorded yet). The loop's nudge is the prompt; run this skill
30
+ to act on it.
31
+ - After a FEAT ships (a natural breakpoint per PRINCIPLE #1).
32
+ - After `boss unlock` lands a new mode (another natural breakpoint).
33
+ - The *third time* you find yourself writing similar code, rewriting a similar skill prompt, or doing
34
+ copy-paste-then-edit (the *"three repetitions"* signal).
35
+ - Mid-session when something feels reusable but you can't name where it should live.
36
+
37
+ ## How to run it
38
+
39
+ ### 1. Orient on recent work
40
+
41
+ Read, silently:
42
+ - Last 10-20 commits (`git log --oneline -20`)
43
+ - The most recent 3-5 devlog entries (`docs/devlog.md` — newest first)
44
+ - The current FEAT (if any) and its acceptance criteria
45
+ - `library/` listing in the BOSS source repo (if accessible) — what's already extracted UP
46
+ - `src/` directory structure — what's accumulating DOWN
47
+
48
+ Don't announce these reads. Just orient.
49
+
50
+ ### 2. Identify 1-3 candidate patterns
51
+
52
+ Look for one of three signals (these are the same signals PRINCIPLE #1 names):
53
+
54
+ **Signal A — same work repeated.** A skill prompt rewritten three times across FEATs. A
55
+ component-shape copied to three pages. A test fixture duplicated across three test files. A
56
+ helper function reinvented twice.
57
+
58
+ **Signal B — a named-and-stable shape.** A pattern you've started referring to by name in
59
+ devlog or commits — *"the cohort-aware framing,"* *"the structured signal,"* *"the
60
+ fallback handler stub."* Names indicate the pattern has earned identity.
61
+
62
+ **Signal C — a load-bearing decision.** A choice that other choices depend on — a schema
63
+ shape, a prompt convention, a file-layout pattern, a test harness. Even if it only exists
64
+ once, its *role* is foundational.
65
+
66
+ For each candidate, name it in plain language. Don't propose more than 3 — pick the most
67
+ load-bearing ones. **It's fine to find zero** — record that explicitly.
68
+
69
+ ### 3. Route each candidate (UP / DOWN / NOT-YET)
70
+
71
+ For each candidate, ask the routing questions:
72
+
73
+ | Question | UP signal | DOWN signal | NOT-YET signal |
74
+ |---|---|---|---|
75
+ | **Can a sibling project reuse this without copy-paste?** | Yes | No (project-specific) | Maybe with rework |
76
+ | **Does this depend on this project's domain?** | No (stack-neutral) | Yes (domain-bound) | Partially |
77
+ | **Has it been used 3+ times in this project?** | Yes (battle-tested) | Yes but won't generalize | No (single use) |
78
+ | **Is the value in the pattern, or in the code?** | The pattern | The code | Unclear |
79
+ | **Where does future-you look for it?** | Across projects (BOSS library) | Within this project (src/) | Not findable yet |
80
+
81
+ Route each candidate based on the dominant signal:
82
+ - **UP** → `library/skills/`, `library/agents/`, `library/practices/`, `library/hooks/`, or
83
+ `library/memory-seed/` in the BOSS source repo. Run `boss learn <path> --as <cat>` to
84
+ copy + bump VERSION + add CHANGELOG entry. (Requires `$BOSS_DEV` or `npm link`-installed
85
+ BOSS — see [`/boss-learn`](../boss-learn/SKILL.md).)
86
+ - **DOWN** → refactor the duplication into a named module/function/schema in `src/`. /extract
87
+ doesn't execute the refactor (the founder owns the code); it names the target file path +
88
+ the smallest valuable refactor.
89
+ - **NOT-YET** → record the candidate + reason. *"Worth watching; not yet load-bearing enough
90
+ / not yet generalizable / not yet used outside one feature."*
91
+
92
+ ### 4. Write `docs/extractions/EXTR-NNN-<slug>.md`
93
+
94
+ Use this skeleton. The frontmatter + `Route` line make the file detectable by extraction-loop.
95
+
96
+ ```markdown
97
+ ---
98
+ id: EXTR-NNN
99
+ type: extraction
100
+ owner: pm
101
+ status: recorded
102
+ created: {{DATE}}
103
+ trigger: <devlog-3-entries | FEAT-NNN-shipped | mode-unlock | third-repetition | manual>
104
+ ---
105
+
106
+ # EXTR-NNN — <one-line summary of the extraction>
107
+
108
+ ## Recent context
109
+ _What was in the air at the breakpoint — last 3-5 devlog entries summarized in 2-3 sentences,
110
+ last commits referenced. Future-you reads this to remember why this extraction happened._
111
+
112
+ ## Candidate 1: <name>
113
+ - **What it is:** <one line>
114
+ - **Where it lives now:** <file paths, scope>
115
+ - **Route:** UP | DOWN | NOT-YET
116
+ - **Rationale:** <why this route — answer the routing questions briefly>
117
+ - **If UP:** target `library/<category>/<name>` — run `boss learn <src-path> --as <cat>` next.
118
+ - **If DOWN:** target `src/<path>` — refactor target named + the smallest valuable cut.
119
+ - **If NOT-YET:** re-open condition: <what would change the answer to UP or DOWN>.
120
+
121
+ ## Candidate 2: <name>
122
+ (same structure)
123
+
124
+ ## Candidate 3: <name>
125
+ (same structure — omit if fewer than 3 candidates)
126
+
127
+ ## What didn't make the cut
128
+ _Patterns you considered and explicitly rejected. Naming what's NOT an extraction is half the
129
+ discipline — it prevents over-extraction the next time the loop fires._
130
+ - <pattern> — <why not>
131
+
132
+ ## Notes
133
+ - Source devlog entries: <date range>
134
+ - Related FEATs: <FEAT-NNN refs>
135
+ - BOSS version when this was recorded: <from VERSION>
136
+ ```
137
+
138
+ ### 5. Update IDS allocation
139
+
140
+ Add a row in `docs/ideas/INDEX.md` (or wherever your project tracks IDs) under an *Extractions*
141
+ heading; allocate the next free `EXTR-NNN` integer by grepping `docs/extractions/*.md`.
142
+
143
+ ### 6. If any candidates are UP — invoke `/boss-learn`
144
+
145
+ For each UP-routed candidate, after the founder confirms:
146
+ - Suggest the `boss learn <path> --as <cat>` invocation.
147
+ - Hand off to `/boss-learn` for the two-way router judgment (it may re-route to DOWN if it
148
+ disagrees — that's the discipline's check on the discipline).
149
+ - Record in the EXTR file whether the UP succeeded (bumped VERSION? CHANGELOG line written?).
150
+
151
+ ### 7. If any candidates are DOWN — name the refactor
152
+
153
+ Don't execute the refactor inside `/extract` (that's `coder-generalist`'s job). Just:
154
+ - Name the target file path and the smallest valuable cut.
155
+ - Add a TODO to `docs/RESUME.md`'s Next-tasks: *"Refactor <name> per EXTR-NNN."*
156
+ - Hand off if the founder wants to start the refactor now.
157
+
158
+ ### 8. If all are NOT-YET — record honestly
159
+
160
+ NOT-YET is a legitimate answer for an extraction record. The `- **Route:** NOT-YET` line
161
+ still closes extraction-loop (the discipline IS the practice of pausing-and-routing, not the
162
+ volume of extractions). Future-you reads this file and sees: *"BOSS made me look; I looked
163
+ honestly; nothing was extractable yet."* That's the principle working.
164
+
165
+ ## Cohort-aware delivery
166
+
167
+ | Cohort | Posture |
168
+ |---|---|
169
+ | `first-product` | Walk through the routing carefully. Define UP / DOWN inline (they may not know what `library/` is). Lean toward NOT-YET for early projects; the discipline of *looking* matters more than the volume of *finding*. |
170
+ | `vibe-coder-newbie` | Show the three signals (A/B/C) with named examples from THIS project. Avoid abstract framings. |
171
+ | `non-tech-founder` | Plain language. UP = "reusable across projects" / DOWN = "make it real in this product" / NOT-YET = "not the right time to extract." They likely own the routing call but may delegate the actual extraction. |
172
+ | `eng-builder` | Terse. They'll spot extractables fast; the question is whether they'll *do* the work. The skill's job is to anchor the decision in the EXTR file, not to teach. |
173
+ | `vibe-virtuoso` | They have a backlog of extractable patterns from past projects. The skill's leverage here is *"which of THESE three from THIS project is the load-bearing one?"* — not the full inventory. |
174
+ | `indie-hacker` | Calm-company framing. UP is investment in the system; DOWN is investment in this product. NOT-YET is the most-used route — patience is the discipline. |
175
+ | `returning-founder` | Skip the routing-question table; they know it. Ask: *"Three sessions in. What did you do twice? What did you almost do a third time?"* They'll name the candidates without prompting. |
176
+ | `domain-expert` | High-stakes: extractions involving regulated logic (PHI handling, financial calculations, legal templates) lean NOT-YET-with-caveats: *"this is too domain-specific to generalize; document the project-internal abstraction; do NOT lift to library/."* The default is conservative. |
177
+
178
+ ## Connection to other loops + skills
179
+
180
+ - **Triggered by:** `extraction-loop` (this skill's job is to close it). The loop opens at
181
+ the heuristic breakpoint; the skill is the judgment.
182
+ - **Routes to:** `/boss-learn` (for UP candidates) — see `library/` rules in the BOSS source
183
+ repo. `/boss-learn` is the two-destination router at the *promote* step; /extract is the
184
+ router at the *propose* step. They share PRINCIPLE #1; they sit at different inflection
185
+ points.
186
+ - **Adjacent:** `/log` (devlog discipline produces the entry signal); `/close` (session-end
187
+ may surface "consider /extract" when devlog has accumulated entries).
188
+
189
+ ## What this skill is NOT
190
+
191
+ - **Not a one-time ritual.** Re-run after each FEAT, each mode unlock, each *"third time"* signal.
192
+ - **Not an automatic refactor.** The skill names the route; the founder (or `coder-generalist`)
193
+ owns the actual code change.
194
+ - **Not a quality gate.** Skipping extraction doesn't fail anything. The override grammar
195
+ applies — record in devlog if you deliberately skip.
196
+
197
+ ## Rules
198
+
199
+ - **NOT-YET is a legitimate answer.** Honest no-pattern-extractable beats inventing one.
200
+ - **Two destinations, not one.** PRINCIPLE #1 is explicit: UP into BOSS or DOWN into app.
201
+ Treating extraction as one-way (always-UP) is the failure mode this skill exists to prevent.
202
+ - **Three signals, not feelings.** Same-work-repeated / named-and-stable / load-bearing-
203
+ decision. If you can't tie the candidate to one of the three, it's premature.
204
+ - **Record before route.** Write the EXTR-NNN file FIRST; invoke `/boss-learn` or refactor
205
+ AFTER. The record IS the discipline.
206
+ - **Cite PRINCIPLE #1.** This skill exists to encode that principle. Naming it in the EXTR
207
+ file ties the discipline back to its source.
208
+ - **At most three candidates per /extract run.** More than three means you're inventorying,
209
+ not pausing. Pick the load-bearing ones; let the rest re-surface naturally.
@@ -0,0 +1,68 @@
1
+ ---
2
+ name: judge-traces
3
+ description: Error analysis on your real session traces — the Hamel/Shankar discipline applied to your own work. Reads .boss/trace.jsonl (what agents actually did), helps you sort what went wrong into a binary pass/fail failure taxonomy, and routes the real failure modes to /boss-learn. The deliberate, founder-invoked reader for the auto-log trace substrate. Usage - /judge-traces [last N | all]
4
+ ---
5
+
6
+ # /judge-traces — read your real traces, find the real failure modes
7
+
8
+ The 2026 eval discipline (Hamel Husain / Shreya Shankar) is blunt: **error analysis on real traces is
9
+ 60–80% of the work, and almost no one looks.** `/judge-traces` is "look" — at *your* sessions, not at
10
+ golden cases someone imagined. It reads the trace your work already leaves and helps you turn it into a
11
+ failure taxonomy you can act on.
12
+
13
+ This is the **deliberate reader** for the `auto-log` trace substrate (`library/hooks/auto-log.js`). The
14
+ hook *collects* (passively, when enabled); this skill *judges* (deliberately, when you ask). They're
15
+ kept apart on purpose — collection is never judgment.
16
+
17
+ ## Prerequisite (and graceful degrade)
18
+
19
+ Reads `.boss/trace.jsonl`. If it doesn't exist or is empty, say so plainly and stop — don't invent
20
+ data:
21
+ > *"No trace yet. `auto-log` is dormant by default (a SubagentStop hook has a per-subagent cost). Turn
22
+ > it on in `.claude/settings.json` (see `library/hooks/auto-log.js`) and the trace will accumulate as
23
+ > agents do work. Come back here once there's something to read."*
24
+
25
+ Never fabricate a taxonomy from no data. An honest "nothing to judge yet" is the correct output.
26
+
27
+ ## How to run it
28
+
29
+ **1. Read the traces.** Load `.boss/trace.jsonl` (each line: `ts`, `session`, `agent`, `files`,
30
+ `file_count`). Default to the last ~30 entries; `all` reads everything; `last N` reads the last N.
31
+
32
+ **2. Surface the shape, factually first.** Before judging: which agents did what, how often, on which
33
+ files. This is the cheap deterministic pass — counts, not opinions. (Hamel's cost hierarchy: cheap
34
+ assertions before any judgment.)
35
+
36
+ **3. Help the founder do error analysis — binary, not scored.** Walk the traces and sort what you can
37
+ see into **pass / fail**, never a 1–5 score. Failure modes to look for in agent work:
38
+ - `wrong-files` — an agent touched files outside its lane (a coder editing docs, a doc agent editing src)
39
+ - `thrash` — the same files churned across many sessions with no shipped outcome
40
+ - `silent-scope-creep` — a small ask that fanned into a large diff
41
+ - `no-trace-of-the-point` — sessions of work with nothing that maps to a named FEAT / risk
42
+ - `<your own>` — the taxonomy is yours; name the modes *you* actually see
43
+
44
+ **One expert, not a committee** (Hamel): *you* own what "fail" means for this project. Don't average
45
+ opinions — make the call.
46
+
47
+ **4. Don't let the judge grade its own homework.** Where a trace looks like a failure, make the read a
48
+ *separate* pass from whatever produced it — and judge the **trajectory** (did the path make sense),
49
+ not just whether the endpoint happened to be fine. A right outcome via a wrong path is still a finding.
50
+
51
+ **5. Route the real modes.** A failure mode that recurs is a candidate for `/boss-learn` — UP (a
52
+ practice/guard for every project) or DOWN (a fix in this app). A one-off is just a one-off; don't
53
+ systematize noise. Name the count: *"`wrong-files` appeared 4× across 3 sessions — worth a guard."*
54
+
55
+ ## Output
56
+
57
+ A short report: the factual shape (agents × files × frequency), then the binary failure taxonomy with
58
+ counts, then the 1–3 modes worth routing to `/boss-learn`. Keep it to what the trace actually shows.
59
+
60
+ ## Rules
61
+
62
+ - **No data → no taxonomy.** Honest "nothing yet" beats a fabricated one.
63
+ - **Binary, not Likert.** Pass/fail forces the decision; scores hide it.
64
+ - **Counts are the signal.** A failure that recurs matters; a one-off doesn't. Lead with frequency.
65
+ - **Local-only.** The trace is this machine's; nothing leaves it. This skill reads, never sends.
66
+ - **Collection ≠ judgment.** `auto-log` collects passively; this judges deliberately. Never fuse them
67
+ into an always-on auto-grader — that's the trap (the conscience would be grading every session
68
+ unprompted). Judgment is a thing you choose to do.
@@ -0,0 +1,64 @@
1
+ ---
2
+ name: log
3
+ description: Append a dated entry to docs/devlog.md — what landed this session, what's next, what surprised you. Lighter than commit messages, denser than CHANGELOG. The thing future-you reads before starting work. Usage - /log <one-line summary or detailed entry>
4
+ ---
5
+
6
+ # /log — the devlog
7
+
8
+ The devlog is the project's working memory between commits and RESUME. Commits answer *what
9
+ changed in the code*; the devlog answers *what happened in the session* — including the decisions
10
+ that didn't show up in a diff: a path tried and abandoned, an assumption updated, a person you
11
+ talked to, a number you saw.
12
+
13
+ If you only read one thing when picking the project back up, read the last devlog entry.
14
+
15
+ ## How to run it
16
+
17
+ 1. Open (or create) `docs/devlog.md`. If creating, seed with:
18
+
19
+ ```markdown
20
+ ---
21
+ id: DEVLOG
22
+ type: devlog
23
+ owner: pm
24
+ status: active
25
+ ---
26
+
27
+ # Devlog — {{PROJECT_NAME}}
28
+
29
+ Append-only. Newest at the top. Each entry: date, FEAT (if any), what landed, what's next.
30
+ ```
31
+
32
+ 2. Add a **new entry at the top** (under the header), dated today:
33
+
34
+ ```markdown
35
+ ## {{today}}
36
+ - **FEAT:** FEAT-NNN <name> _(or "no FEAT — exploration/ops")_
37
+ - **Landed:** <one or two lines — what's now real that wasn't before>
38
+ - **Next:** <the very next thing — concrete, one or two lines>
39
+ - **Surprises / decisions:** <only if there was one — what changed in your model of the problem>
40
+ ```
41
+
42
+ 3. If the user gave you a one-liner, that's enough — fill it into **Landed**, leave **Next** empty
43
+ only if they didn't say what comes next. Don't fabricate. Blanks are honest.
44
+ 4. If a FEAT closed (acceptance criteria met + smoke green), also flip its status in the FEAT doc
45
+ and `docs/ideas/INDEX.md` to `shipped`.
46
+
47
+ ## What belongs in a devlog entry
48
+
49
+ - The path you tried that didn't work and what you'd do instead.
50
+ - The number you saw — a latency, a user count, a sign-up rate.
51
+ - A conversation that changed the plan.
52
+ - A decision the diff doesn't capture ("we picked X over Y because…").
53
+
54
+ ## What doesn't
55
+
56
+ - A list of every file you edited. The diff already says that.
57
+ - Commit messages copied in verbatim.
58
+ - Plans for next week. Those go in `docs/RESUME.md` via `/close`.
59
+
60
+ ## Rules
61
+
62
+ - Newest at the top, append-only — never edit old entries. (Wrong? Add a correction in today's entry.)
63
+ - One entry per session, usually. Multiple short entries are fine if the day shifted gears.
64
+ - Short is better than complete. A devlog you'll actually write beats one you won't.
@@ -0,0 +1,92 @@
1
+ ---
2
+ name: practice
3
+ description: Capture a craft learning — a better way to build with AI you found — as a shared, attributed PRAC-NNN record your cofounder gets too. The team's commons for staying current on the fast-moving agentic-AI craft, so you both build on each other's discoveries instead of re-learning them. Staleness-aware — AI moves fast, so a practice can carry a review date. Usage - /practice <what you learned>
4
+ ---
5
+
6
+ # /practice — the shared craft commons
7
+
8
+ The craft of building with AI changes monthly — new models, new agent patterns, a cheaper way to do the
9
+ thing you did the expensive way last week. Each founder picks up discoveries on their own; without a
10
+ shared place, they stay trapped in one person's head, and a cofounder re-learns the same lesson the hard
11
+ way (or keeps doing it the outdated, expensive way).
12
+
13
+ A **practice** is one such learning, written down once and **shared** with the whole team: *"use streaming
14
+ for the long generations — it cut perceived latency in half"*, *"`claude-haiku-4-5` is plenty for the
15
+ classification step, ~10× cheaper than Opus"*, *"this prompt structure stopped the JSON drift."* The point
16
+ isn't ceremony — it's that **you both get current together and can focus on building**, not on worrying
17
+ whether you're behind or overspending.
18
+
19
+ ## When to reach for it
20
+
21
+ - You found a genuinely better/cheaper/newer way to do something with AI and your cofounder would benefit.
22
+ - You hit a sharp edge (a model quirk, a cost trap, an agent failure mode) and worked out how to handle it.
23
+ - A `/vet` verdict or outside best-practice proved itself in *your* build — capture what actually held.
24
+
25
+ Skip it for one-off trivia or anything that belongs in the code as a comment. A practice is a *transferable*
26
+ lesson, not a changelog line.
27
+
28
+ ## How to run it
29
+
30
+ 1. **Resolve who learned it** (the credit — and so a teammate knows who to ask):
31
+
32
+ ```bash
33
+ gh api user --jq '.login' 2>/dev/null || git config user.name
34
+ ```
35
+
36
+ Use it as `@<login>`; leave `@you` if neither resolves. Never fabricate.
37
+
38
+ 2. **Pick the next number.** Highest `PRAC-NNN` in `docs/practices/` + 1 (first is `PRAC-001`). Create the
39
+ directory if needed.
40
+
41
+ 3. **Write `docs/practices/PRAC-NNN-<slug>.md`:**
42
+
43
+ ```markdown
44
+ ---
45
+ id: PRAC-NNN
46
+ type: practice
47
+ owner: "@<github-login of whoever learned it>"
48
+ status: active # active | stale | retired
49
+ created: {{today}}
50
+ applies_to: <what this is about — e.g. "Claude Code" / "model: opus-4.8" / "prompting" / "cost">
51
+ review_by: <YYYY-MM-DD> # optional — when to re-check it's still the best way (see staleness below)
52
+ ---
53
+
54
+ # PRAC-NNN — <the learning, in one line>
55
+
56
+ ## What we learned
57
+ The practice itself, concretely enough that your cofounder could apply it tomorrow.
58
+
59
+ ## Why it works
60
+ The reasoning — so it transfers, and so a teammate can tell when it stops applying.
61
+
62
+ ## How to apply
63
+ The shortest path to using it here. A snippet, a command, a setting — whatever makes it real.
64
+ ```
65
+
66
+ 4. **Fill from what the founder gave you.** Don't pad it. A two-line practice that's true beats a page that
67
+ guesses. Blanks are honest.
68
+
69
+ ## Staleness — the part that keeps you current (not just documented)
70
+
71
+ The AI craft moves fast, so a practice can quietly **go out of date** — a model that was cheapest last
72
+ month no longer is, a workaround that a new release made unnecessary. Two honest moves:
73
+
74
+ - **Set `review_by:`** for anything tied to a specific model, price, or tool version — a date to re-check
75
+ *"is this still the best way?"* (Don't set it for timeless practices.) When it passes, `/revalidate` the
76
+ practice the same way you would a paused idea: still true? still the best way? anything changed? → keep /
77
+ update / retire (flip `status:` to `retired`, or supersede with a new `PRAC`).
78
+ - This is the team's quiet defense against **being outdated or overspending** — BOSS surfaces the question;
79
+ you don't have to carry the anxiety of tracking every model release yourself.
80
+
81
+ ## Shared by construction
82
+
83
+ `docs/practices/` **commits with the repo** (it's not gitignored), so the moment you push, every practice
84
+ is backed up and your cofounder has it. Attribution (`owner:`) is recognition + a pointer to who to ask —
85
+ not a scoreboard. This is the hive-mind half of the founder layer: you each make the other richer.
86
+
87
+ ## What this is not
88
+
89
+ - Not a style guide or the conventions doc — those live in `AGENTS.md`. A practice is a *discovered lesson*,
90
+ often AI-specific and time-sensitive.
91
+ - Not a place to log every change. If it isn't transferable craft, it doesn't earn a `PRAC`.
92
+ - Not a ranking. The commons measures *what the team knows*, never *who contributed more*.