ai-collab-open-system 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. package/.aict/START_HERE.md +127 -0
  2. package/.aict/WORKSPACE_MANIFEST.json +91 -0
  3. package/.aict/acceptance/EXAMPLE.synthetic.md +49 -0
  4. package/.aict/acceptance/FAILURE_MODES.md +40 -0
  5. package/.aict/acceptance/PROMPT.md +47 -0
  6. package/.aict/acceptance/README.md +44 -0
  7. package/.aict/acceptance/TEMPLATE.md +57 -0
  8. package/.aict/adapters/SHARED_CORE_CONTRACT.md +106 -0
  9. package/.aict/adapters/claude-code/ADAPTER.md +28 -0
  10. package/.aict/adapters/cline/ADAPTER.md +28 -0
  11. package/.aict/adapters/codex/ADAPTER.md +28 -0
  12. package/.aict/adapters/copilot/ADAPTER.md +28 -0
  13. package/.aict/adapters/cursor/ADAPTER.md +28 -0
  14. package/.aict/adapters/windsurf/ADAPTER.md +28 -0
  15. package/.aict/context/EXAMPLE.synthetic.md +53 -0
  16. package/.aict/context/FAILURE_MODES.md +40 -0
  17. package/.aict/context/PROMPT.md +47 -0
  18. package/.aict/context/README.md +44 -0
  19. package/.aict/context/TEMPLATE.md +63 -0
  20. package/.aict/cookbook/README.md +8 -0
  21. package/.aict/cookbook/bridge-to-a-second-family.md +103 -0
  22. package/.aict/cookbook/connect-a-tool.md +67 -0
  23. package/.aict/cookbook/review-a-half-product.md +79 -0
  24. package/.aict/cookbook/run-a-first-loop.md +81 -0
  25. package/.aict/examples/README.md +21 -0
  26. package/.aict/examples/ai-coding-long-task/CASE.md +161 -0
  27. package/.aict/examples/ai-coding-long-task/artifacts/acceptance-card.md +36 -0
  28. package/.aict/examples/ai-coding-long-task/artifacts/context-package.md +30 -0
  29. package/.aict/examples/ai-coding-long-task/artifacts/execution-prompt.md +30 -0
  30. package/.aict/examples/ai-coding-long-task/artifacts/first-ai-output.md +109 -0
  31. package/.aict/examples/ai-coding-long-task/artifacts/guard-review.md +40 -0
  32. package/.aict/examples/ai-coding-long-task/artifacts/handoff-note.md +28 -0
  33. package/.aict/examples/ai-coding-long-task/artifacts/harvest-seed.md +28 -0
  34. package/.aict/examples/ai-coding-long-task/artifacts/revised-output.md +62 -0
  35. package/.aict/examples/content-production-harvest/CASE.md +87 -0
  36. package/.aict/examples/content-production-harvest/artifacts/acceptance-card.md +28 -0
  37. package/.aict/examples/content-production-harvest/artifacts/context-package.md +28 -0
  38. package/.aict/examples/content-production-harvest/artifacts/execution-prompt.md +30 -0
  39. package/.aict/examples/content-production-harvest/artifacts/guard-review.md +28 -0
  40. package/.aict/examples/content-production-harvest/artifacts/handoff-note.md +28 -0
  41. package/.aict/examples/content-production-harvest/artifacts/harvest-seed.md +28 -0
  42. package/.aict/examples/multi-tool-collaboration/CASE.md +87 -0
  43. package/.aict/examples/multi-tool-collaboration/artifacts/acceptance-card.md +28 -0
  44. package/.aict/examples/multi-tool-collaboration/artifacts/context-package.md +28 -0
  45. package/.aict/examples/multi-tool-collaboration/artifacts/execution-prompt.md +30 -0
  46. package/.aict/examples/multi-tool-collaboration/artifacts/guard-review.md +28 -0
  47. package/.aict/examples/multi-tool-collaboration/artifacts/handoff-note.md +28 -0
  48. package/.aict/examples/multi-tool-collaboration/artifacts/harvest-seed.md +28 -0
  49. package/.aict/examples/personal-judgment-growth-assistant/CASE.md +87 -0
  50. package/.aict/examples/personal-judgment-growth-assistant/artifacts/acceptance-card.md +28 -0
  51. package/.aict/examples/personal-judgment-growth-assistant/artifacts/context-package.md +28 -0
  52. package/.aict/examples/personal-judgment-growth-assistant/artifacts/execution-prompt.md +30 -0
  53. package/.aict/examples/personal-judgment-growth-assistant/artifacts/guard-review.md +28 -0
  54. package/.aict/examples/personal-judgment-growth-assistant/artifacts/handoff-note.md +28 -0
  55. package/.aict/examples/personal-judgment-growth-assistant/artifacts/harvest-seed.md +28 -0
  56. package/.aict/examples/research-knowledge-synthesis/CASE.md +87 -0
  57. package/.aict/examples/research-knowledge-synthesis/artifacts/acceptance-card.md +28 -0
  58. package/.aict/examples/research-knowledge-synthesis/artifacts/context-package.md +28 -0
  59. package/.aict/examples/research-knowledge-synthesis/artifacts/execution-prompt.md +30 -0
  60. package/.aict/examples/research-knowledge-synthesis/artifacts/guard-review.md +28 -0
  61. package/.aict/examples/research-knowledge-synthesis/artifacts/handoff-note.md +28 -0
  62. package/.aict/examples/research-knowledge-synthesis/artifacts/harvest-seed.md +28 -0
  63. package/.aict/guard/EXAMPLE.synthetic.md +51 -0
  64. package/.aict/guard/FAILURE_MODES.md +40 -0
  65. package/.aict/guard/PROMPT.md +47 -0
  66. package/.aict/guard/README.md +44 -0
  67. package/.aict/guard/TEMPLATE.md +60 -0
  68. package/.aict/handoff/EXAMPLE.synthetic.md +51 -0
  69. package/.aict/handoff/FAILURE_MODES.md +40 -0
  70. package/.aict/handoff/PROMPT.md +47 -0
  71. package/.aict/handoff/README.md +44 -0
  72. package/.aict/handoff/TEMPLATE.md +60 -0
  73. package/.aict/harvest/EXAMPLE.synthetic.md +51 -0
  74. package/.aict/harvest/FAILURE_MODES.md +40 -0
  75. package/.aict/harvest/PROMPT.md +47 -0
  76. package/.aict/harvest/README.md +44 -0
  77. package/.aict/harvest/TEMPLATE.md +60 -0
  78. package/.aict/mechanisms/README.md +34 -0
  79. package/.aict/mechanisms/anti-drift-partner/EXAMPLE.synthetic.md +46 -0
  80. package/.aict/mechanisms/anti-drift-partner/FAILURE_MODES.md +25 -0
  81. package/.aict/mechanisms/anti-drift-partner/PROMPT.md +75 -0
  82. package/.aict/mechanisms/anti-drift-partner/README.md +82 -0
  83. package/.aict/mechanisms/anti-drift-partner/TEMPLATE.md +74 -0
  84. package/.aict/mechanisms/blind-spot-scan/EXAMPLE.synthetic.md +39 -0
  85. package/.aict/mechanisms/blind-spot-scan/FAILURE_MODES.md +25 -0
  86. package/.aict/mechanisms/blind-spot-scan/PROMPT.md +72 -0
  87. package/.aict/mechanisms/blind-spot-scan/README.md +79 -0
  88. package/.aict/mechanisms/blind-spot-scan/TEMPLATE.md +70 -0
  89. package/.aict/mechanisms/collaboration-coach/EXAMPLE.synthetic.md +40 -0
  90. package/.aict/mechanisms/collaboration-coach/FAILURE_MODES.md +25 -0
  91. package/.aict/mechanisms/collaboration-coach/PROMPT.md +72 -0
  92. package/.aict/mechanisms/collaboration-coach/README.md +79 -0
  93. package/.aict/mechanisms/collaboration-coach/TEMPLATE.md +61 -0
  94. package/.aict/mechanisms/do-not-handle-yet/EXAMPLE.synthetic.md +15 -0
  95. package/.aict/mechanisms/do-not-handle-yet/FAILURE_MODES.md +16 -0
  96. package/.aict/mechanisms/do-not-handle-yet/PROMPT.md +41 -0
  97. package/.aict/mechanisms/do-not-handle-yet/README.md +30 -0
  98. package/.aict/mechanisms/do-not-handle-yet/TEMPLATE.md +38 -0
  99. package/.aict/mechanisms/dual-guard/EXAMPLE.synthetic.md +54 -0
  100. package/.aict/mechanisms/dual-guard/FAILURE_MODES.md +25 -0
  101. package/.aict/mechanisms/dual-guard/PROMPT.md +76 -0
  102. package/.aict/mechanisms/dual-guard/README.md +81 -0
  103. package/.aict/mechanisms/dual-guard/TEMPLATE.md +73 -0
  104. package/.aict/mechanisms/feedback-absorption-ledger/EXAMPLE.synthetic.md +49 -0
  105. package/.aict/mechanisms/feedback-absorption-ledger/FAILURE_MODES.md +25 -0
  106. package/.aict/mechanisms/feedback-absorption-ledger/PROMPT.md +74 -0
  107. package/.aict/mechanisms/feedback-absorption-ledger/README.md +81 -0
  108. package/.aict/mechanisms/feedback-absorption-ledger/TEMPLATE.md +69 -0
  109. package/.aict/mechanisms/half-product-review/EXAMPLE.synthetic.md +15 -0
  110. package/.aict/mechanisms/half-product-review/FAILURE_MODES.md +16 -0
  111. package/.aict/mechanisms/half-product-review/PROMPT.md +41 -0
  112. package/.aict/mechanisms/half-product-review/README.md +30 -0
  113. package/.aict/mechanisms/half-product-review/TEMPLATE.md +38 -0
  114. package/.aict/mechanisms/handoff-abc/EXAMPLE.synthetic.md +47 -0
  115. package/.aict/mechanisms/handoff-abc/FAILURE_MODES.md +25 -0
  116. package/.aict/mechanisms/handoff-abc/PROMPT.md +75 -0
  117. package/.aict/mechanisms/handoff-abc/README.md +82 -0
  118. package/.aict/mechanisms/handoff-abc/TEMPLATE.md +60 -0
  119. package/.aict/mechanisms/harvest-and-erc/EXAMPLE.synthetic.md +43 -0
  120. package/.aict/mechanisms/harvest-and-erc/FAILURE_MODES.md +25 -0
  121. package/.aict/mechanisms/harvest-and-erc/PROMPT.md +74 -0
  122. package/.aict/mechanisms/harvest-and-erc/README.md +81 -0
  123. package/.aict/mechanisms/harvest-and-erc/TEMPLATE.md +60 -0
  124. package/.aict/mechanisms/honest-calibration/EXAMPLE.synthetic.md +43 -0
  125. package/.aict/mechanisms/honest-calibration/FAILURE_MODES.md +25 -0
  126. package/.aict/mechanisms/honest-calibration/PROMPT.md +74 -0
  127. package/.aict/mechanisms/honest-calibration/README.md +81 -0
  128. package/.aict/mechanisms/honest-calibration/TEMPLATE.md +66 -0
  129. package/.aict/mechanisms/one-click-dispatch/EXAMPLE.synthetic.md +15 -0
  130. package/.aict/mechanisms/one-click-dispatch/FAILURE_MODES.md +16 -0
  131. package/.aict/mechanisms/one-click-dispatch/PROMPT.md +41 -0
  132. package/.aict/mechanisms/one-click-dispatch/README.md +30 -0
  133. package/.aict/mechanisms/one-click-dispatch/TEMPLATE.md +38 -0
  134. package/.aict/mechanisms/plain-language-first-screen/EXAMPLE.synthetic.md +15 -0
  135. package/.aict/mechanisms/plain-language-first-screen/FAILURE_MODES.md +16 -0
  136. package/.aict/mechanisms/plain-language-first-screen/PROMPT.md +41 -0
  137. package/.aict/mechanisms/plain-language-first-screen/README.md +30 -0
  138. package/.aict/mechanisms/plain-language-first-screen/TEMPLATE.md +38 -0
  139. package/.aict/mechanisms/root-cause-brake/EXAMPLE.synthetic.md +55 -0
  140. package/.aict/mechanisms/root-cause-brake/FAILURE_MODES.md +25 -0
  141. package/.aict/mechanisms/root-cause-brake/PROMPT.md +73 -0
  142. package/.aict/mechanisms/root-cause-brake/README.md +79 -0
  143. package/.aict/mechanisms/root-cause-brake/TEMPLATE.md +74 -0
  144. package/.aict/mechanisms/scout-review-controller/EXAMPLE.synthetic.md +15 -0
  145. package/.aict/mechanisms/scout-review-controller/FAILURE_MODES.md +16 -0
  146. package/.aict/mechanisms/scout-review-controller/PROMPT.md +41 -0
  147. package/.aict/mechanisms/scout-review-controller/README.md +30 -0
  148. package/.aict/mechanisms/scout-review-controller/TEMPLATE.md +38 -0
  149. package/.aict/mechanisms/single-tool-guard/EXAMPLE.synthetic.md +54 -0
  150. package/.aict/mechanisms/single-tool-guard/FAILURE_MODES.md +25 -0
  151. package/.aict/mechanisms/single-tool-guard/PROMPT.md +76 -0
  152. package/.aict/mechanisms/single-tool-guard/README.md +83 -0
  153. package/.aict/mechanisms/single-tool-guard/TEMPLATE.md +75 -0
  154. package/.aict/mechanisms/task-splitting/EXAMPLE.synthetic.md +53 -0
  155. package/.aict/mechanisms/task-splitting/FAILURE_MODES.md +25 -0
  156. package/.aict/mechanisms/task-splitting/PROMPT.md +72 -0
  157. package/.aict/mechanisms/task-splitting/README.md +79 -0
  158. package/.aict/mechanisms/task-splitting/TEMPLATE.md +76 -0
  159. package/.aict/modes/README.md +11 -0
  160. package/.aict/modes/execute.md +31 -0
  161. package/.aict/modes/handoff.md +29 -0
  162. package/.aict/modes/harvest.md +30 -0
  163. package/.aict/modes/review.md +28 -0
  164. package/.aict/modes/shape.md +34 -0
  165. package/.aict/privacy/COMMERCIAL_BOUNDARY.md +34 -0
  166. package/.aict/privacy/PRIVACY.md +36 -0
  167. package/.aict/privacy/REDACTION_CHECKLIST.md +12 -0
  168. package/.aict/profile/CANDIDATES.md +44 -0
  169. package/.aict/profile/EXAMPLE.synthetic.md +49 -0
  170. package/.aict/profile/FAILURE_MODES.md +40 -0
  171. package/.aict/profile/PROMPT.md +47 -0
  172. package/.aict/profile/README.md +44 -0
  173. package/.aict/profile/TEMPLATE.md +57 -0
  174. package/.aict/prompts/acceptance-definition.md +109 -0
  175. package/.aict/prompts/guard-review.md +116 -0
  176. package/.aict/prompts/handoff-generation.md +110 -0
  177. package/.aict/prompts/harvest-extraction.md +110 -0
  178. package/.aict/prompts/mode-switching.md +66 -0
  179. package/.aict/prompts/profile-creation.md +66 -0
  180. package/.aict/prompts/profile-refinement.md +66 -0
  181. package/.aict/prompts/project-context-packaging.md +113 -0
  182. package/.aict/prompts/red-team-challenge.md +106 -0
  183. package/.aict/prompts/rule-update-proposal.md +114 -0
  184. package/.aict/prompts/workflow-reset.md +109 -0
  185. package/.aict/roles/README.md +18 -0
  186. package/.aict/roles/executor.md +34 -0
  187. package/.aict/roles/harvester.md +33 -0
  188. package/.aict/roles/owner-controller.md +38 -0
  189. package/.aict/roles/scout.md +33 -0
  190. package/.aict/roles/supervisor.md +34 -0
  191. package/.aict/roles/system-guardian.md +34 -0
  192. package/.aict/skills/acceptance/SKILL.md +43 -0
  193. package/.aict/skills/context/SKILL.md +44 -0
  194. package/.aict/skills/evidence-pack/SKILL.md +42 -0
  195. package/.aict/skills/guard/SKILL.md +46 -0
  196. package/.aict/skills/handoff/SKILL.md +44 -0
  197. package/.aict/skills/harvest/SKILL.md +44 -0
  198. package/.aict/skills/mode-switch/SKILL.md +42 -0
  199. package/.aict/skills/profile/SKILL.md +42 -0
  200. package/.aict/skills/red-team/SKILL.md +42 -0
  201. package/.aict/skills/single-tool-guard/SKILL.md +42 -0
  202. package/.aict/state/CURRENT_STATE.md +13 -0
  203. package/.aict/state/DECISIONS.md +7 -0
  204. package/.aict/state/TASK_LOG.md +7 -0
  205. package/.aict/state/evidence.jsonl +2 -0
  206. package/.aict/state/learning-ledger.jsonl +1 -0
  207. package/.aict/state/receipts.jsonl +1 -0
  208. package/.aict/state/runs.jsonl +1 -0
  209. package/.aict/state/tasks.jsonl +1 -0
  210. package/.aict/walkthroughs/10-minute-your-task.md +107 -0
  211. package/.aict/walkthroughs/10-minute.md +43 -0
  212. package/.aict/walkthroughs/30-minute.md +22 -0
  213. package/.aict/walkthroughs/60-minute.md +27 -0
  214. package/.aict/walkthroughs/synthetic-loop-transcript.md +43 -0
  215. package/CHANGELOG.md +23 -0
  216. package/CODE_OF_CONDUCT.md +20 -0
  217. package/CONTRIBUTING.md +30 -0
  218. package/KNOWN_LIMITATIONS.md +54 -0
  219. package/LICENSE +199 -0
  220. package/PRODUCT_CONTRACT.md +446 -0
  221. package/README.md +245 -0
  222. package/RELEASE_CHECKLIST.md +78 -0
  223. package/SECURITY.md +56 -0
  224. package/START_HERE.md +89 -0
  225. package/bin/ai-collab.js +2 -0
  226. package/docs/DOGFOOD.md +85 -0
  227. package/docs/FEEDBACK.md +61 -0
  228. package/docs/FIRST_EXPERIENCE_SPEC.md +32 -0
  229. package/docs/FREE_VS_PAID.md +53 -0
  230. package/docs/PUBLIC_BOUNDARY.md +36 -0
  231. package/docs/PUBLIC_MAPPING.md +178 -0
  232. package/docs/RELEASE_PRIORITY.md +23 -0
  233. package/docs/WHY_THIS_EXISTS.md +36 -0
  234. package/docs/open-system/00-start-here.md +60 -0
  235. package/docs/open-system/01-ai-collaboration-os.md +33 -0
  236. package/docs/open-system/02-six-layer-architecture.md +45 -0
  237. package/docs/open-system/03-role-system.md +33 -0
  238. package/docs/open-system/04-core-mechanisms.md +34 -0
  239. package/docs/open-system/05-failure-patterns.md +31 -0
  240. package/docs/open-system/06-how-to-adapt-to-your-workflow.md +31 -0
  241. package/package.json +69 -0
  242. package/privacy-manifest.json +78 -0
  243. package/privacy-scan.local.json.example +18 -0
  244. package/scripts/lib/forbidden-in-pack.js +55 -0
  245. package/scripts/pack-check.js +154 -0
  246. package/scripts/privacy-scan.js +487 -0
  247. package/scripts/validate-contract.js +160 -0
  248. package/src/adapters.js +590 -0
  249. package/src/bootstrap.js +1184 -0
  250. package/src/catalog.js +2723 -0
  251. package/src/cli.js +2899 -0
  252. package/src/dialogue.js +470 -0
  253. package/src/i18n.js +1034 -0
  254. package/src/ledger.js +2011 -0
  255. package/src/render.js +1381 -0
  256. package/src/sendmodel.js +452 -0
  257. package/src/validate.js +1307 -0
  258. package/src/workspace.js +1679 -0
  259. package/tests/contract.test.js +8514 -0
package/src/catalog.js ADDED
@@ -0,0 +1,2723 @@
1
+ export const layerDefinitions = [ // Catalog of six layer entries (profile, context, acceptance, guard, handoff, harvest); every entry has the same set of fields.
2
+ {
3
+ id: "profile", // NOTE(review): each id matches a package/.aict/<id>/ directory in the file listing; presumably the generated docs are keyed by it. Confirm against the consumer.
4
+ title: "Profile",
5
+ summary: "How the AI adapts to the user's working style, constraints, and decision habits.",
6
+ zh: "让 AI 先知道怎样配合这个人,而不是每次都按通用助理口吻开始。", // Chinese-language one-line description of the layer.
7
+ purpose: "Capture stable collaboration preferences that affect how an assistant should ask, decide, challenge, summarize, and hand work back.",
8
+ when: "Use before long tasks, recurring work, cross-tool handoffs, or any situation where tone, risk appetite, and decision rules matter.",
9
+ input: "A short description of the user's role, work type, preferred feedback style, constraints, review habits, and known failure patterns.",
10
+ output: "A compact profile card with preferences, hard boundaries, collaboration defaults, and update rules.",
11
+ prompt: "Create a profile card for this user. Extract only reusable collaboration preferences, not private secrets. Separate stable preferences from task-specific context. Return: working style, decision rules, communication preferences, hard boundaries, and what future assistants should ask before acting.",
12
+ template: [ // Blank labels to fill in; within each entry, `example` repeats these labels in the same order with sample content.
13
+ "User role or situation:",
14
+ "Preferred collaboration style:",
15
+ "Decision rules:",
16
+ "Hard boundaries:",
17
+ "Review habits:",
18
+ "Known failure patterns:",
19
+ "How to update this profile:"
20
+ ],
21
+ example: [ // Filled-in sample of the template above.
22
+ "User role or situation: Solo product builder validating a weekend prototype.",
23
+ "Preferred collaboration style: Direct risk calls, short summaries, concrete next actions.",
24
+ "Decision rules: Prefer evidence from user behavior over elegant architecture.",
25
+ "Hard boundaries: Do not upload private notes or make purchasing decisions.",
26
+ "Review habits: Wants assumptions and unverified claims labeled.",
27
+ "Known failure patterns: Over-polished plans can replace real validation.",
28
+ "How to update this profile: Add new stable preferences only after they appear in at least two tasks."
29
+ ],
30
+ failures: [ // Ways this layer tends to go wrong.
31
+ "Treating a profile as a personality essay instead of operational guidance.",
32
+ "Storing secrets, account details, or private identity signals.",
33
+ "Mixing task context into the stable profile and making future sessions stale.",
34
+ "Letting the assistant infer values without user confirmation."
35
+ ]
36
+ },
37
+ {
38
+ id: "context",
39
+ title: "Context",
40
+ summary: "What the task boundary is, what is known, and what should not be assumed.",
41
+ zh: "把当前任务边界写清楚,避免新会话靠猜。",
42
+ purpose: "Package the task so an assistant can start from the right boundary, evidence, constraints, and unknowns without reading the whole history.",
43
+ when: "Use at the start of any task that spans more than one message, touches files, involves judgment, or may be resumed later.",
44
+ input: "Goal, current state, relevant files or links, constraints, non-goals, known facts, assumptions, risks, and open questions.",
45
+ output: "A context package that lets another assistant continue without inventing missing background.",
46
+ prompt: "Turn this messy situation into a context package. Separate facts from assumptions. Include goal, current state, relevant artifacts, constraints, non-goals, risks, and open questions. Keep private material summarized or redacted.",
47
+ template: [
48
+ "Goal:",
49
+ "Current state:",
50
+ "Relevant artifacts:",
51
+ "Constraints:",
52
+ "Non-goals:",
53
+ "Facts:",
54
+ "Assumptions:",
55
+ "Risks:",
56
+ "Open questions:"
57
+ ],
58
+ example: [
59
+ "Goal: Prepare a public beta onboarding flow for a synthetic note app.",
60
+ "Current state: Landing copy exists; onboarding has not been tested.",
61
+ "Relevant artifacts: README draft, onboarding checklist, synthetic tester notes.",
62
+ "Constraints: No analytics SDK until privacy language is reviewed.",
63
+ "Non-goals: Payment, account recovery, enterprise SSO.",
64
+ "Facts: Three testers abandoned before creating a first note.",
65
+ "Assumptions: The first-run prompt may be too abstract.",
66
+ "Risks: Improving copy without testing the actual flow.",
67
+ "Open questions: Which first action should count as activation?"
68
+ ],
69
+ failures: [
70
+ "Dumping the whole history instead of compressing decision-changing facts.",
71
+ "Not labeling assumptions.",
72
+ "Forgetting non-goals, causing assistants to expand scope.",
73
+ "Including private file paths or raw conversations."
74
+ ]
75
+ },
76
+ {
77
+ id: "acceptance",
78
+ title: "Acceptance",
79
+ summary: "What done means before work starts.",
80
+ zh: "先定义完成标准,再让 AI 干活。",
81
+ purpose: "Make success inspectable by defining observable outcomes, required artifacts, verification steps, and explicit non-acceptance conditions.",
82
+ when: "Use before implementation, writing, research, design, cleanup, or any task where 'looks good' would be too vague.",
83
+ input: "Goal, expected artifacts, constraints, quality bar, verification command or manual check, and conditions that would reject the work.",
84
+ output: "An acceptance card that a reviewer can use to pass, reject, or request changes.",
85
+ prompt: "Write an acceptance card for this task. Define concrete deliverables, pass criteria, required checks, rejected states, and evidence needed before claiming completion. Do not rely on vibes or intent.",
86
+ template: [
87
+ "Task:",
88
+ "Deliverables:",
89
+ "Pass criteria:",
90
+ "Required checks:",
91
+ "Rejected states:",
92
+ "Evidence needed:",
93
+ "Owner decision needed:"
94
+ ],
95
+ example: [
96
+ "Task: Create a reusable onboarding checklist for a synthetic notes app.",
97
+ "Deliverables: One checklist, one example run, one review note.",
98
+ "Pass criteria: A new user can complete first note creation without reading strategy prose.",
99
+ "Required checks: Run through the checklist using the synthetic case.",
100
+ "Rejected states: Only a theory doc exists; no concrete user path.",
101
+ "Evidence needed: Completed example artifact and reviewer note.",
102
+ "Owner decision needed: Whether to include account creation in the first loop."
103
+ ],
104
+ failures: [
105
+ "Defining acceptance after the answer is already written.",
106
+ "Using subjective phrases like polished, robust, or complete without evidence.",
107
+ "Skipping rejected states.",
108
+ "Treating tests as proof when they do not cover the stated behavior."
109
+ ]
110
+ },
111
+ {
112
+ id: "guard",
113
+ title: "Guard / Review",
114
+ summary: "How output is challenged before trust.",
115
+ zh: "让另一个视角先挑错,再决定是否相信产物。",
116
+ purpose: "Challenge work against requirements, risks, privacy boundaries, evidence quality, and user intent before it becomes trusted output.",
117
+ when: "Use after a draft, implementation, research answer, plan, or handoff that could mislead future work if wrong.",
118
+ input: "The artifact under review, acceptance card, context package, known constraints, and the specific review stance.",
119
+ output: "A review result with findings, severity, evidence, required fixes, and a pass or reject recommendation.",
120
+ prompt: "Review this artifact against the context and acceptance card. Prioritize concrete defects, missing evidence, privacy risk, unsupported claims, and scope drift. Return findings ordered by severity with file or section references when possible.",
121
+ template: [
122
+ "Artifact reviewed:",
123
+ "Review stance:",
124
+ "Acceptance source:",
125
+ "Findings:",
126
+ "Evidence:",
127
+ "Required fixes:",
128
+ "Residual risk:",
129
+ "Recommendation:"
130
+ ],
131
+ example: [
132
+ "Artifact reviewed: Onboarding checklist draft.",
133
+ "Review stance: First-user path and privacy review.",
134
+ "Acceptance source: Acceptance card dated synthetic-case-01.",
135
+ "Findings: P1 checklist says 'review analytics' but privacy constraint forbids adding analytics now.",
136
+ "Evidence: Context states no analytics SDK until privacy language is reviewed.",
137
+ "Required fixes: Replace analytics step with manual tester observation.",
138
+ "Residual risk: Still untested with a real user.",
139
+ "Recommendation: Reject until the flow uses privacy-safe evidence."
140
+ ],
141
+ failures: [
142
+ "Reviewing style instead of requirements.",
143
+ "Letting the same assistant rubber-stamp its own work.",
144
+ "Listing vague concerns without actionable fixes.",
145
+ "Calling something passed without checking the acceptance card."
146
+ ]
147
+ },
148
+ {
149
+ id: "handoff",
150
+ title: "Handoff",
151
+ summary: "How the next session resumes without restarting.",
152
+ zh: "给下一棒留一个能直接接上的交接卡。",
153
+ purpose: "Transfer current state, decisions, evidence, blockers, and next actions to another session or tool without replaying the whole conversation.",
154
+ when: "Use before stopping, switching tools, delegating to another assistant, or after any work that may need continuation.",
155
+ input: "Goal, current status, completed work, changed files or artifacts, decisions, verification evidence, blockers, and next action.",
156
+ output: "A short handoff note that separates done, pending, blocked, and unverified work.",
157
+ prompt: "Create a handoff note for the next AI session. Include goal, current state, completed work, pending work, blockers, decisions, verification evidence, and the exact next action. Label unverified claims.",
158
+ template: [
159
+ "Goal:",
160
+ "Current status:",
161
+ "Completed:",
162
+ "Pending:",
163
+ "Blocked:",
164
+ "Decisions:",
165
+ "Verification evidence:",
166
+ "Next action:"
167
+ ],
168
+ example: [
169
+ "Goal: Finish synthetic onboarding checklist.",
170
+ "Current status: Draft exists and failed guard review on privacy-safe evidence.",
171
+ "Completed: Context package and acceptance card are ready.",
172
+ "Pending: Replace analytics step with manual tester observation.",
173
+ "Blocked: Need owner choice on whether account creation belongs in scope.",
174
+ "Decisions: First loop focuses on first note creation only.",
175
+ "Verification evidence: Guard review found one P1 issue.",
176
+ "Next action: Edit checklist step 4 and rerun guard review."
177
+ ],
178
+ failures: [
179
+ "Writing a narrative summary instead of a resumable state card.",
180
+ "Not separating completed from unverified.",
181
+ "Omitting blockers and causing the next assistant to guess.",
182
+ "Leaving no exact next action."
183
+ ]
184
+ },
185
+ {
186
+ id: "harvest",
187
+ title: "Harvest",
188
+ summary: "What becomes reusable knowledge or material.",
189
+ zh: "把一次任务里可复用的经验收回来,而不是让它消失在聊天里。",
190
+ purpose: "Extract reusable patterns, prompts, decisions, examples, and rule-update candidates from completed work.",
191
+ when: "Use after a task loop, review, failed attempt, content draft, research synthesis, or repeated workflow friction.",
192
+ input: "Final artifact, review result, decisions made, surprising lessons, repeated pain points, and reusable snippets.",
193
+ output: "A harvest seed with reusable material, where to store it, and what should not be generalized.",
194
+ prompt: "Extract harvest from this completed task. Separate reusable knowledge, reusable prompt fragments, decision records, future rule candidates, and material that should stay task-specific. Do not over-generalize from one case.",
195
+ template: [
196
+ "Source task:",
197
+ "Reusable knowledge:",
198
+ "Reusable prompts:",
199
+ "Decision record:",
200
+ "Rule update candidates:",
201
+ "Do not generalize:",
202
+ "Storage target:",
203
+ "Next reuse:"
204
+ ],
205
+ example: [
206
+ "Source task: Synthetic onboarding checklist.",
207
+ "Reusable knowledge: First-run flows need one observable activation event.",
208
+ "Reusable prompts: Ask the assistant to name one artifact a new user can produce.",
209
+ "Decision record: Avoid analytics until privacy language is reviewed.",
210
+ "Rule update candidates: Every onboarding case needs a privacy-safe evidence step.",
211
+ "Do not generalize: The synthetic notes-app activation event may not fit other products.",
212
+ "Storage target: examples/content-production-harvest/artifacts/harvest-seed.md.",
213
+ "Next reuse: Apply the activation-event test to another synthetic product case."
214
+ ],
215
+ failures: [
216
+ "Harvesting everything and creating clutter.",
217
+ "Turning one example into a universal rule.",
218
+ "Saving private raw material instead of synthetic or redacted learning.",
219
+ "Not linking harvest back to future reuse."
220
+ ]
221
+ }
222
+ ];
223
+
224
+ export const promptDefinitions = [
225
+ {
226
+ file: "profile-creation.md",
227
+ title: "Profile creation",
228
+ purpose: "Create a reusable collaboration profile from a redacted user description.",
229
+ scenario: "Use when a user is starting a new AI workspace and wants future sessions to know how to collaborate without storing private secrets.",
230
+ inputRequirements: [
231
+ "Redacted description of the user's work and preferred feedback style.",
232
+ "Known boundaries: actions the assistant must not take without consent.",
233
+ "Examples of useful and unhelpful assistant behavior."
234
+ ],
235
+ steps: [
236
+ "Extract stable preferences only; ignore one-off task facts.",
237
+ "Separate communication style, decision rules, safety boundaries, and update triggers.",
238
+ "Ask no more than three questions if a boundary is ambiguous.",
239
+ "Mark any inferred preference as provisional."
240
+ ],
241
+ outputFormat: [
242
+ "Profile summary",
243
+ "Collaboration defaults",
244
+ "Hard boundaries",
245
+ "Review and challenge preferences",
246
+ "When to update this profile"
247
+ ],
248
+ failureModes: [
249
+ "Turning the profile into a biography.",
250
+ "Saving secrets, customer names, local paths, or raw private conversations.",
251
+ "Treating a single emotional moment as a permanent preference."
252
+ ],
253
+ example: "Input: 'I build prototypes alone and hate vague reassurance.' Output: 'Use direct risk calls, short options, and evidence labels; do not make purchases or publish without explicit consent.'"
254
+ },
255
+ {
256
+ file: "profile-refinement.md",
257
+ title: "Profile refinement",
258
+ purpose: "Update an existing profile with only stable new preferences.",
259
+ scenario: "Use after several sessions reveal a repeated working preference or a profile rule is causing friction.",
260
+ inputRequirements: [
261
+ "Current profile card.",
262
+ "New evidence from at least one recent task.",
263
+ "Whether the user explicitly confirmed the new preference."
264
+ ],
265
+ steps: [
266
+ "Compare new evidence against the existing profile.",
267
+ "Classify each candidate as stable, task-specific, contradictory, or unsafe to store.",
268
+ "Preserve older rules unless there is clear replacement evidence.",
269
+ "Return a patch-style update instead of rewriting the whole profile."
270
+ ],
271
+ outputFormat: [
272
+ "Keep unchanged",
273
+ "Add",
274
+ "Revise",
275
+ "Do not store",
276
+ "Open confirmation question"
277
+ ],
278
+ failureModes: [
279
+ "Overwriting the profile because one task went badly.",
280
+ "Adding private operational detail as if it were a collaboration preference.",
281
+ "Hiding uncertainty by merging contradictory preferences."
282
+ ],
283
+ example: "Current profile says 'ask before execution.' New evidence says user now gives task-level authorization for explicit implementation requests. Output a narrow revision with the exact trigger."
284
+ },
285
+ {
286
+ file: "project-context-packaging.md",
287
+ title: "Project context packaging",
288
+ purpose: "Compress a messy task into facts, boundaries, assumptions, risks, and open questions.",
289
+ scenario: "Use when a task spans files, sessions, tools, or decisions and a new assistant would otherwise start by guessing.",
290
+ inputRequirements: [
291
+ "Goal in the user's words.",
292
+ "Current state and relevant artifacts.",
293
+ "Constraints, non-goals, facts, assumptions, blockers, and known risks."
294
+ ],
295
+ steps: [
296
+ "Name the task boundary before summarizing details.",
297
+ "Split facts from assumptions and decisions from preferences.",
298
+ "Compress history into information that changes what the next assistant should do.",
299
+ "End with the next action and the smallest missing question."
300
+ ],
301
+ outputFormat: [
302
+ "Goal",
303
+ "Current state",
304
+ "Relevant artifacts",
305
+ "Constraints and non-goals",
306
+ "Facts",
307
+ "Assumptions",
308
+ "Risks",
309
+ "Open questions",
310
+ "Next action"
311
+ ],
312
+ failureModes: [
313
+ "Dumping a transcript instead of packaging context.",
314
+ "Omitting non-goals and letting scope expand.",
315
+ "Losing evidence links or file references needed for review.",
316
+ "Handing over the option list as the whole map, so the receiver inherits whatever the author forgot.",
317
+ "Naming which clause or category the task belongs to but never what the owner ultimately does with the result."
318
+ ],
319
+ example: "Messy input says 'fix onboarding, maybe pricing too, users are confused.' Output separates onboarding as the current scope and pricing as a non-goal until evidence changes.",
320
+ operativeCore: {
321
+ trigger: "Use at the start of any task that will cross a boundary — span multiple files, sessions, or tools, get handed to a different assistant, or be resumed later — and where a new assistant starting cold would otherwise begin by guessing. Package the context BEFORE the next assistant acts, so it inherits a boundary instead of a transcript.",
322
+ antiTrigger: "Skip the full packaging for a single-step task with one obvious goal and no state worth transferring, or a quick fact lookup you will act on yourself in the same breath. A full facts/boundaries/assumptions/risks/open-questions package for a one-line ask is overhead the receiver does not need, and ceremony with no payoff trains people to skip packaging when the task is actually tangled.",
323
+ input: "The goal in the owner's own words. The current state and the artifacts that matter (files, links, prior decisions). The constraints and the explicit non-goals. What is known as fact versus assumed. The blockers and known risks. And, honestly, what you may have left off the list — because the biggest risk in a handoff is the option the author never wrote down.",
324
+ process: [
325
+ "Name the task boundary first — goal plus explicit non-goals — before summarizing any detail. A package without a boundary invites the receiver to expand scope into whatever looks interesting.",
326
+ "Compress the situation into five buckets, not a narrative: FACTS (verified, with the evidence or file reference), BOUNDARIES (in scope vs out of scope), ASSUMPTIONS (believed but unverified, labeled as such), RISKS (what could go wrong), OPEN QUESTIONS (what is genuinely undecided). Split fact from assumption and decision from preference; do not let a confident sentence blur the two.",
327
+ "Climb the purpose chain before fixing the delivery shape: do not stop at 'this task belongs to category X'. Ask what the owner will ultimately DO with the result — decide something, unblock a downstream step, cut review cost — because the end use is what sets the right granularity, depth, and format. A deliverable that maps to a category but serves no actual use is a compliance document, not a usable tool. If you cannot answer the end use, stop and re-define the task rather than packaging detail around a goal you have not pinned.",
328
+ "Frame the next-step options as a menu, not the map: it is your view of what comes next, and you may have missed, misordered, or mis-scoped an item. Mark which options are well-grounded and which are guesses, and invite the receiver to find an unlisted option D or E.",
329
+ "Hand the receiver an explicit first-round judgment to run before executing: (1) what is this task actually for — restate the goal, the current sub-task, and the completion bar; (2) is this option list exhaustive — what did I miss; (3) is there an unlisted item that serves the main line versus one that would hijack it? An option list accepted without questioning is an option list whose blind spots get inherited.",
330
+ "End with one concrete next action and the smallest missing question — the single piece of information that, once answered, would most change what the receiver should do."
331
+ ],
332
+ outputShape: [
333
+ "Goal and boundary: the goal in one line, with explicit non-goals.",
334
+ "Facts: verified items, each with its evidence or file reference.",
335
+ "Assumptions: believed-but-unverified items, labeled, kept separate from facts.",
336
+ "Risks: what could go wrong, ordered by impact.",
337
+ "Open questions: what is genuinely undecided, with the smallest missing question called out.",
338
+ "End-use chain: what the owner ultimately does with the result, and how that sets the delivery shape.",
339
+ "Option menu (not the map): next-step options, each marked well-grounded or guess, with a note of what may be missing.",
340
+ "Receiver's first-round check: the three questions to answer before executing (what is this for / is the list exhaustive / is there a serving-vs-hijacking D or E).",
341
+ "Next action: one concrete first step the receiver can take from this package alone."
342
+ ],
343
+ passBar: [
344
+ "The task boundary — goal plus explicit non-goals — is stated before any detail, so scope cannot quietly expand.",
345
+ "Facts and assumptions are in separate buckets, and every assumption is labeled rather than dressed as fact.",
346
+ "The end-use chain is answered: what the owner finally does with the result is named, not just which category the task falls under.",
347
+ "The option list is framed as a menu with missing or guessed items flagged, not presented as the complete set of next steps.",
348
+ "There is one concrete next action and one smallest-missing-question, and a cold reader could continue from this package without the original chat."
349
+ ],
350
+ rejectBar: [
351
+ "The package is a replayed transcript or a story of what happened, not a compressed boundary a stranger can act on.",
352
+ "Non-goals are missing, so the receiver is free to expand scope into whatever looks interesting.",
353
+ "An assumption is stated as a fact with no label and no evidence, and a later step would rest on it.",
354
+ "The task is mapped to a clause or category but the owner's ultimate use is never named, so the delivery granularity is set by guesswork.",
355
+ "The option list is handed over as the whole map, inviting the receiver to inherit the author's blind spots instead of testing for a missing D or E."
356
+ ],
357
+ counterExample: "Synthetic: a handoff package says 'finish the onboarding work; options are A polish the copy, B add a tooltip, C reorder the steps; next: do A.' A cold receiver that runs the first-round check instead of grabbing A asks question one — what is this actually for — and finds the real end use is 'get more new users to create their first note', not 'make the copy nicer'. Question two exposes that the author never listed the fact that three testers abandoned before the first note even loaded — an unlisted item D (the first screen is broken) that the copy options would never fix. The package had mapped the task to the category 'onboarding copy' but never to the owner's actual use, so the polished menu would have shipped prettier words on a screen users never reached."
358
+ }
359
+ },
360
+ {
361
+ file: "acceptance-definition.md",
362
+ title: "Acceptance definition",
363
+ purpose: "Define observable pass criteria and required evidence before work starts.",
364
+ scenario: "Use before implementation, writing, research, cleanup, or review when 'looks good' would be too vague.",
365
+ inputRequirements: [
366
+ "Task goal and expected artifact.",
367
+ "Quality bar and constraints.",
368
+ "Available verification commands or manual checks.",
369
+ "Rejected states that must not be accepted."
370
+ ],
371
+ steps: [
372
+ "Turn the goal into inspectable deliverables.",
373
+ "Write pass criteria that a skeptical reviewer can test.",
374
+ "List required evidence before any completion claim.",
375
+ "Add explicit rejected states to prevent false closure."
376
+ ],
377
+ outputFormat: [
378
+ "Deliverables",
379
+ "Pass criteria",
380
+ "Required checks",
381
+ "Rejected states",
382
+ "Evidence needed",
383
+ "Owner decision needed"
384
+ ],
385
+ failureModes: [
386
+ "Accepting intent instead of observable output.",
387
+ "Writing criteria after the work is already done.",
388
+ "Letting tests pass even though they do not prove the user-facing requirement.",
389
+ "Accepting a verbal 'it is done / it landed' with no pasted command output behind it.",
390
+ "Collapsing 'not verified yet' into 'done' so the reviewer cannot tell which claims are actually proven."
391
+ ],
392
+ example: "For a CLI dry-run task, acceptance requires exit 0, no files created, clear stdout, and a test proving the target directory stays empty.",
393
+ operativeCore: {
394
+ trigger: "Use before any work where 'looks good' would be too vague and where a wrong 'done' would propagate: an implementation, a piece of writing, a research result, a cleanup, or any artifact another session or person will trust. Define done BEFORE the work starts, not after.",
395
+ antiTrigger: "Skip the full ceremony for a throwaway one-line change, a quick fact lookup, or a step you will fully re-check by hand in the next minute anyway. Writing a six-part acceptance card for a trivial edit is cost you pay for nothing, and ceremony with no payoff trains people to skip acceptance when it actually matters.",
396
+ input: "The task goal and the exact artifact expected. The quality bar and constraints. The verification commands or manual checks actually available (the real test command, the real grep, the real way to reproduce). The rejected states that must never be silently accepted. If the work is already partly done, the completion claims made so far so each can be tagged with a state.",
397
+ process: [
398
+ "Turn the goal into inspectable deliverables: name the file, the behavior, the output a skeptical reviewer could open and check. A deliverable you cannot point at is a deliverable you cannot accept.",
399
+ "Write each pass criterion so it is mechanically checkable (a command that exits 0, a string that must appear, a behavior a stranger can reproduce), not a vibe like 'works well' or 'is clean'.",
400
+ "Bind every completion claim to one of exactly three states and forbid any fourth: NOT DONE (nothing to show yet), DONE-PENDING-VERIFICATION (built but not yet proven), ACCEPTED (proven and signed off). Ban soft words like 'basically done', 'should be fine', 'mostly working' — they hide which state the work is really in.",
401
+ "Require that any DONE-PENDING-VERIFICATION claim carry three-part evidence, and that the three parts are present before the claim counts: (1) WHAT CHANGED — file plus line plus a one-line description, specific enough that a reader sees the change in thirty seconds; (2) REAL COMMAND OUTPUT — the actual pasted text of grep -n / diff / wc -l / ls -la / the test run, not a verbal 'it is landed' or 'tests pass'; (3) WHAT IS NOT YET VERIFIED — the blind spots, edge cases, cross-file effects, and downstream dependencies, listed openly so the owner decides whether to verify or to accept the gap on the record.",
402
+ "List the rejected states explicitly so false closure is blocked up front: e.g. exit code ignored, a test that passes without exercising the user-facing requirement, a claim broader than its evidence, scope that drifted past the stated non-goals.",
403
+ "Hand the card back as the contract the work will be judged against, and name what still needs an owner decision (which gaps are acceptable, which must be closed before acceptance)."
404
+ ],
405
+ outputShape: [
406
+ "Deliverables: the concrete artifacts, each one something a reviewer can open and inspect.",
407
+ "Pass criteria: numbered, each mechanically checkable.",
408
+ "Required checks: the exact command or manual step that proves each criterion.",
409
+ "Three states in use: which claims are NOT DONE / DONE-PENDING-VERIFICATION / ACCEPTED right now.",
410
+ "Required evidence per claim: the three-part block (what changed / real command output / what is not yet verified) demanded before any DONE-PENDING-VERIFICATION claim is trusted.",
411
+ "Rejected states: the failure conditions that must never be silently accepted.",
412
+ "Owner decision needed: which residual gaps need a human call before acceptance."
413
+ ],
414
+ passBar: [
415
+ "Every pass criterion is checkable by a named command or a reproducible manual step, not by opinion.",
416
+ "Each completion claim is tagged NOT DONE / DONE-PENDING-VERIFICATION / ACCEPTED, with no soft fourth state.",
417
+ "Every DONE-PENDING-VERIFICATION claim has all three evidence parts, and part two is real pasted command output, not a verbal assurance.",
418
+ "Rejected states are written down so false closure has an explicit tripwire.",
419
+ "What stays unverified is named openly and left to the owner to accept or close, not buried."
420
+ ],
421
+ rejectBar: [
422
+ "A criterion is a vibe ('looks clean', 'works well') that no command or reproducible step can test.",
423
+ "A claim says 'done' / 'basically done' / 'should be fine' without a state tag, so the reviewer cannot tell proven from unproven.",
424
+ "A DONE-PENDING-VERIFICATION claim rests on a verbal 'it landed' or 'tests pass' with no pasted output — exactly the gap where a fluent claim outruns the evidence.",
425
+ "A test passes but does not exercise the actual user-facing requirement, and that is being counted as acceptance.",
426
+ "Unverified areas are folded into 'done' instead of being listed for an owner decision."
427
+ ],
428
+ counterExample: "Synthetic: an assistant reports 'Done — added keyboard reordering to the task board and all tests pass.' Under this prompt the claim is tagged DONE-PENDING-VERIFICATION and the three-part evidence is demanded. Part two comes back empty: there is no pasted test output for a keyboard case, only the sentence 'tests pass'. Acceptance is withheld. When the real `grep -n moveTask` and test run are pasted, they show that the keyboard handler only logs the key and never calls the reorder function, and that no keyboard test exists — so the verbal 'all tests pass' was a half-product claim the evidence never backed."
429
+ }
430
+ },
431
+ {
432
+ file: "guard-review.md",
433
+ title: "Guard review",
434
+ purpose: "Review an artifact against context, acceptance, evidence, and privacy boundaries.",
435
+ scenario: "Use after a draft or implementation exists and before future work treats it as trusted.",
436
+ inputRequirements: [
437
+ "Artifact under review.",
438
+ "Context package.",
439
+ "Acceptance card.",
440
+ "Verification evidence and known unverified areas."
441
+ ],
442
+ steps: [
443
+ "Start with findings, not compliments.",
444
+ "Tie each issue to a requirement, line, section, or command output.",
445
+ "Separate blocker, high, medium, and residual risks.",
446
+ "Return one of the four verdicts: pass, reject, insufficient_evidence, or pass_with_risk."
447
+ ],
448
+ outputFormat: [
449
+ "Verdict",
450
+ "Findings ordered by severity",
451
+ "Evidence",
452
+ "Required fixes",
453
+ "Residual risk",
454
+ "Pass or reject recommendation"
455
+ ],
456
+ failureModes: [
457
+ "Rubber-stamping the same assistant's output.",
458
+ "Reviewing tone while ignoring the acceptance card.",
459
+ "Calling work complete without fresh verification evidence.",
460
+ "Trusting the author's own 'I checked it' on counts, citations, or self-rule-compliance — exactly the claims a model is worst at self-judging.",
461
+ "Reading a polished structure (headings, summary, confident prose) as proof the underlying thing actually runs."
462
+ ],
463
+ example: "Guard rejects a README that claims multi-tool integration when the code only writes adapter guidance files.",
464
+ operativeCore: {
465
+ trigger: "Use after a draft or implementation exists and before any future work treats it as trusted: a completion claim, a release candidate, a citation-heavy document, a 'done and tested' report, or any artifact whose wrong 'looks fine' would propagate into later work.",
466
+ antiTrigger: "Skip the full review on low-stakes, easily reversible work, or a step the owner is about to fully re-run by hand: a one-line wording fix, a scratch draft, a trivial config tweak. A heavy guard pass on throwaway work is ceremony, and ceremony with no payoff trains people to skip review when it matters.",
467
+ input: "The artifact under review, with stable line or section references so a finding can cite an exact spot. The acceptance card or definition of done it claims to meet. The context boundary (goal, scope, non-goals). The verification evidence the completion claim rests on (real command output, test results, a reproduced behavior) or a clear note that none exists. Which model family drafted it, so a same-family 'I reviewed my own work' is weighted as the weak signal it is.",
468
+ process: [
469
+ "Lead with findings, not compliments. The job is to find what is wrong before it propagates, not to reassure.",
470
+ "Hunt for the five faces of a half-product — work that looks finished but is not — and for each, run its concrete tell. (1) DONE BUT NOT ACTUALLY DONE: the report says complete; demand the command plus real output and confirm it was actually run, not narrated. (2) PAPER EXISTS BUT THE FUNCTION DOES NOT: the file/hook/feature is present but never wired in; check that it is actually invoked and produces a real effect, not just that it exists on disk. (3) NUMBERS LOOK RIGHT BUT THE DENOMINATOR IS WRONG: a clean percentage or count; pin the exact scope and denominator and recompute, because pending and not-started items get quietly mixed in. (4) JUDGMENT THEATER: 'I judged it safe / I reviewed it'; require a named failure scenario actually tried, or an independent check — a verdict with no attempted break is decoration. (5) CITATION DRIFT: a real source is cited but is misquoted, has a clause dropped, or is summarized into a different claim; open the source and compare the quoted span word-for-word, because drift looks more trustworthy than invention.",
471
+ "Treat the author's own self-assessment as unreliable in seven specific zones and re-verify each independently rather than accepting 'I checked it': (1) counts — fields, lines, items, totals drift between two spots in the same artifact; (2) self-audit checklists with empty filler rows like 'expected N' or a bracketed blank but no actual result filled in; (3) self-audit table rows that describe a check in prose but never show the command that performs it; (4) 'tested' claims that state an expected value but no observed value; (5) attention on long documents — past a few hundred lines a self-check goes formal and misses things, so spot-check the tail, not just the top; (6) self-rule-compliance — 'I followed rule X strictly' is contradicted by at least one counter-instance often enough that it cannot be taken on faith; (7) running totals carried across sessions, which accumulate off-by-one drift. In all seven, the sentence 'I already verified this' is itself the thing not to trust.",
472
+ "Tie every issue to a requirement, a line, a section, or a specific missing piece of evidence. A finding that cannot point at a spot is a vibe, not a finding.",
473
+ "Sort findings into blocker / high / medium / residual, and state which are decision-changing.",
474
+ "State the GUARD LEVEL — how strong the evidence you actually had was — because it caps the verdict you may give. L0: you saw only a completion summary. L1: the artifact and acceptance card exist but there is no real run/test output. L2: you have the author's commands or tests but you are a single tool / single model family. L3: there is a structured evidence pack AND a guard from a different model family pressed on it. L4: on top of that cross-family review (L3), you ALSO independently re-ran the key evidence and reconciled it to a recorded run — a rerun alone, single-family, stays L2.",
475
+ "Return ONE of the four standard verdicts, bounded by the level — pass / reject / insufficient_evidence / pass_with_risk. The ceiling: L0 can only be insufficient_evidence; L1 cannot pass (best is reject or pass_with_risk); L2 (single tool) tops out at pass_with_risk; a plain pass needs L3+ (the cross-family pack); an L4 pass must cite BOTH the cross-family pack AND your reconciled rerun output. A pass_with_risk is NOT 'accepted' on your say-so — the owner must explicitly accept the named residual risk. For anything not fixed, name it as residual risk on the record. A reviewer never silently upgrades 'reads fine' into 'verified'."
476
+ ],
477
+ outputShape: [
478
+ "Verdict: one of pass / reject / insufficient_evidence / pass_with_risk (with the single decisive reason).",
479
+ "Guard level: L0 / L1 / L2 / L3 / L4 — the strength of the evidence you actually had, which bounds the verdict above.",
480
+ "Findings ordered by severity, each tied to a line, section, requirement, or missing evidence.",
481
+ "Half-product check: which of the five faces appeared, and the tell that exposed it.",
482
+ "Self-assessment check: which of the seven unreliable zones were re-verified independently and what that re-check found.",
483
+ "Evidence: the command output, citation comparison, or reproduction the verdict rests on (for an L4 pass, the cross-family pack AND your reconciled rerun output).",
484
+ "Required fixes: the concrete change each blocker needs.",
485
+ "Residual risk: what stays unverified and who must accept it (a pass_with_risk needs an explicit owner sign-off)."
486
+ ],
487
+ passBar: [
488
+ "The verdict is within the guard level's ceiling: no plain pass below L3, no pass from a single tool, no L4 pass without BOTH a cross-family pack and a reconciled rerun, and L0 only ever yields insufficient_evidence.",
489
+ "Every completion claim is backed by evidence the guard could actually point to, not by the author's assurance.",
490
+ "None of the five half-product faces survives unexamined; any that appeared was caught with its concrete tell.",
491
+ "Each of the seven self-assessment zones that applies was re-verified by the guard, not taken on the author's 'I checked it'.",
492
+ "All acceptance criteria are met, or the unmet ones are named as accepted residual risk rather than hidden; any pass_with_risk has an explicit owner sign-off.",
493
+ "No private material leaked and scope stayed inside the stated boundary."
494
+ ],
495
+ rejectBar: [
496
+ "The verdict claims more than the guard level allows — a plain pass on summary-only or single-tool evidence, or an L4 pass with no cross-family pack or no reconciled rerun (the 'single tool dressed up as a binding pass' failure).",
497
+ "A pass_with_risk is treated as accepted with no explicit owner sign-off on the residual risk.",
498
+ "A completion claim asserts more than the evidence shows — the classic 'said it was done but it was not'.",
499
+ "A file/feature is counted as working purely because it exists, with no proof it is wired in and produces an effect.",
500
+ "A number or percentage is accepted without pinning its denominator, so not-started items are silently inflating it.",
501
+ "A safety or correctness call is 'I judged it fine' with no failure scenario tried and no independent check.",
502
+ "A citation is trusted without opening the source, and a word-for-word compare would have shown drift.",
503
+ "The review leans on the author's self-audit in one of the seven unreliable zones instead of re-verifying it."
504
+ ],
505
+ counterExample: "Synthetic: an artifact ships with a confident summary, a self-audit table with every row ticked, and the line 'I verified all 12 acceptance items pass.' A tone-only review would approve. Under this prompt: the self-audit rows describe checks but show no commands (zone 3), 'tested' rows give expected values but no observed ones (zone 4), and the count '12' is 11 when actually listed (zone 1). Re-verifying independently shows two items were never wired in (half-product face 2) and one cited source was summarized into a claim it never made (face 5). Because the guard only had the author's artifact and no real run, this is guard level L1 — which cannot pass anyway. Verdict: reject — the polished surface was hiding three unproven claims, which is exactly why the author's own 'I verified it' could not clear the gate."
506
+ }
507
+ },
508
+ {
509
+ file: "red-team-challenge.md",
510
+ title: "Red-team challenge",
511
+ purpose: "Attack a plan or artifact from the angle most likely to make it fail.",
512
+ scenario: "Use before high-risk decisions, public release, data migration, force overwrite, or claims that could mislead users.",
513
+ inputRequirements: [
514
+ "Plan or artifact to challenge.",
515
+ "What failure would be expensive, embarrassing, unsafe, or irreversible.",
516
+ "Known assumptions and skipped checks."
517
+ ],
518
+ steps: [
519
+ "Name the most damaging plausible failure first.",
520
+ "Try to break the plan through user behavior, missing evidence, privacy leakage, and rollback gaps.",
521
+ "Distinguish fatal flaws from acceptable tradeoffs.",
522
+ "End with the smallest test or design change that would reduce risk."
523
+ ],
524
+ outputFormat: [
525
+ "Worst plausible failure",
526
+ "Attack paths",
527
+ "Evidence gaps",
528
+ "Required mitigations",
529
+ "Acceptable residual risk"
530
+ ],
531
+ failureModes: [
532
+ "Being theatrical instead of specific.",
533
+ "Inventing impossible threats while missing mundane data-loss risks.",
534
+ "Challenging the idea but not the exact acceptance criteria.",
535
+ "Giving gentle suggestions ('consider adding a check') instead of demonstrating an actual break.",
536
+ "Attacking only malicious outsiders while ignoring the ordinary user who clicks the wrong thing and the insider who takes a convenient shortcut."
537
+ ],
538
+ example: "For a --force option, the red team asks whether user-created files inside the target directory survive or are backed up.",
539
+ operativeCore: {
540
+ trigger: "Use before high-stakes, hard-to-reverse moves: a public release, a data migration, a force-overwrite or destructive flag, a security or auth change, a permissions or payment path, or any claim that could mislead users if it were wrong. Run it when you need to know how the thing breaks, not whether it is nice.",
541
+ antiTrigger: "Skip it on low-stakes, easily reversible work, or where the worst case is trivial and self-correcting: a wording tweak, a scratch script, a change you can undo in one step with no data or trust at risk. A full attack pass on trivial work is theater, and theater with no payoff trains people to ignore the red team when the stakes are real.",
542
+ input: "The plan or artifact to attack. What a failure would cost — money, data loss, a privacy leak, user harm, reputational damage, an irreversible state. The acceptance criteria it claims to meet. The assumptions the author is leaning on and the checks they admit they skipped. Who can touch it: end users, insiders, automated callers, a future maintainer who forgot the context.",
543
+ process: [
544
+ "Take the attacker's stance, not the helper's. Your job is to make the thing fail, get abused, or get bypassed — and to show it, not to politely suggest improvements. 'Consider adding validation' is not a red-team finding; 'here is the exact input that deletes the user's files' is.",
545
+ "Name the single most damaging plausible failure first, in concrete terms (what is lost, who is harmed, can it be undone). Lead with the worst case so a reader feels the stakes before the details.",
546
+ "Walk real abuse and bypass paths, not exotic ones: the ordinary user who clicks the destructive button by accident or misreads the prompt; the insider who takes the convenient shortcut around the safeguard; the automated or repeated call that hits an unhandled edge; the malformed or hostile input; the missing rollback when a step fails halfway; the privacy or secret that leaks through an error message, a log, or an example. Prefer the mundane data-loss path over the cinematic one.",
547
+ "For each path, try to actually break it on the provided material and show the trigger — the exact input, sequence, or condition — and the resulting damage. If you cannot fully demonstrate it from what was given, say what specific evidence or test would confirm or kill the attack.",
548
+ "Separate fatal flaws (must fix before this ships) from acceptable trade-offs (real but tolerable, named as residual risk). Do not inflate every nit into a blocker; that buries the one attack that actually matters.",
549
+ "End with the smallest concrete change or test that closes the most dangerous path — the minimal mitigation, not a rewrite."
550
+ ],
551
+ outputShape: [
552
+ "Worst plausible failure: the single most damaging realistic outcome, stated concretely.",
553
+ "Attack paths: each with its trigger (exact input / sequence / condition) and the damage it causes.",
554
+ "How it gets abused or bypassed: the accidental-user, insider-shortcut, and automated-edge angles, not just the malicious outsider.",
555
+ "Evidence gaps: what could not be fully demonstrated and the exact test that would confirm or kill each attack.",
556
+ "Fatal vs tolerable: which findings block shipping and which are named residual risk.",
557
+ "Smallest mitigation: the minimal change or test that closes the most dangerous path."
558
+ ],
559
+ passBar: [
560
+ "The worst-case failure is named first and in concrete, this-is-what-is-lost terms.",
561
+ "At least one attack is shown with its actual trigger and damage, not phrased as a gentle suggestion.",
562
+ "Accidental-misuse and insider-shortcut paths are covered, not only deliberate outsider attacks.",
563
+ "Mundane data-loss and rollback gaps are checked, not skipped in favor of exotic threats.",
564
+ "Findings are split into fatal vs tolerable, and each fatal one comes with the smallest mitigation that closes it."
565
+ ],
566
+ rejectBar: [
567
+ "The output is theatrical or vague ('an attacker could do bad things') with no concrete trigger.",
568
+ "It only offers polite improvements and never demonstrates or pins a real break.",
569
+ "It invents impossible or exotic threats while missing the obvious data-loss, overwrite, or leak path.",
570
+ "It challenges the idea in general but never tests the exact acceptance criteria or the destructive flag in question.",
571
+ "Every finding is rated a blocker, so the one attack that actually matters is buried in nits."
572
+ ],
573
+ counterExample: "Synthetic: a CLI ships a `--force` flag that re-creates a workspace directory. A gentle review says 'consider warning the user before overwriting.' The red-team stance instead shows the break: run `init --force` in a directory where the user also keeps their own notes file, and the worst case is that those user-created files are deleted with no backup and no undo. The trigger is concrete (force flag plus a non-empty target), the damage is irreversible data loss, the abuse path is the ordinary user who did not realize the directory was shared, and the smallest mitigation is to move the existing workspace to a timestamped backup before recreating it and to refuse on unexpected extra files — not a rewrite."
574
+ }
575
+ },
576
+ { // Prompt entry for "handoff-generation.md": short summary fields first, then the detailed operativeCore guidance.
577
+ file: "handoff-generation.md",
578
+ title: "Handoff generation",
579
+ purpose: "Write a next-session state card with completed, pending, blocked, and unverified work.",
580
+ scenario: "Use before stopping, changing tools, delegating, or compressing a long task into a resumable state.",
581
+ inputRequirements: [
582
+ "Current goal and status.",
583
+ "Completed work and changed artifacts.",
584
+ "Verification commands and outputs.",
585
+ "Blockers, decisions, and the next action."
586
+ ],
587
+ steps: [
588
+ "State the goal in one sentence.",
589
+ "Separate done, pending, blocked, and unverified items.",
590
+ "Record exact commands or checks already run.",
591
+ "Give the next session one concrete first action."
592
+ ],
593
+ outputFormat: [ // Section headings of the short-form handoff card.
594
+ "Goal",
595
+ "Current status",
596
+ "Completed",
597
+ "Pending",
598
+ "Blocked",
599
+ "Verification evidence",
600
+ "Next action"
601
+ ],
602
+ failureModes: [
603
+ "Writing a story instead of a state transfer.",
604
+ "Hiding failed checks in a summary.",
605
+ "Leaving the next assistant to rediscover the first command.",
606
+ "Presenting the option list as the whole world, so the receiver never asks what the author missed.",
607
+ "Letting the receiver grab a tempting new item on round one and hijack the main line that was actually handed off."
608
+ ],
609
+ example: "Handoff says: tests pass, pack smoke not run, adapter installer added, next action is fresh temp install.",
610
+ operativeCore: { // Expanded guidance: trigger/antiTrigger, input, process steps, outputShape, passBar/rejectBar, and one counterExample.
611
+ trigger: "Use before work crosses a boundary: a session is ending, a different tool or model is taking over, a long task is being compressed into a resumable state, or you are delegating to someone who was not in the original context.",
612
+ antiTrigger: "Skip the full packet when the same session simply continues, or for a trivial task with a single obvious next step and no state worth transferring. A formal handoff for a one-line continuation is overhead the receiver does not need.",
613
+ input: "The goal in the author's words and the current status. The completed work and the artifacts that changed. The exact verification commands run and their real outputs (including the ones that failed). The blockers, the decisions already made, the sealed baseline (the exact point being handed off), and the author's best read of the next action. Crucially, an honest note of what the author may have left off the list.",
614
+ process: [
615
+ "State the goal in one sentence, then separate the work into done / pending / blocked / unverified. Do not narrate a story; transfer a state a stranger can act on.",
616
+ "Record the exact commands or checks already run with their real outputs — including failures shown openly, never smoothed into a summary. A check claimed but not shown is treated as not run.",
617
+ "Frame the option list honestly as a menu, not the whole map: it is the author's view of what comes next, and the author may have missed, misordered, or mis-scoped an item. Mark which options are well-grounded and which are guesses.",
618
+ "Give the receiver an explicit first-round judgment to run before executing, not just a task to pick: (1) what is this next stint actually for — restate the main-line goal, the current sub-task, and the completion bar; (2) is the handed-off option list exhaustive — what did the author miss; (3) is there an unlisted item D or E, and if so does it serve the main line or hijack it? An option list you accept without questioning is an option list whose blind spots you inherit.",
619
+ "Set the discipline for new items: a genuinely high-signal item that does NOT serve the current main line is recorded as a parked / after-closeout residual, not seized on round one. High signal is not the same as 'do it now'; the main line that was handed off keeps priority unless the owner re-points it.",
620
+ "Give one concrete first action and the exact baseline to start from, so the receiver re-enters work cleanly instead of re-deriving the starting point or re-explaining the background."
621
+ ],
622
+ outputShape: [
623
+ "Goal: one sentence.",
624
+ "Current status: done / pending / blocked / unverified, kept separate.",
625
+ "Verification evidence: the exact commands run and their real outputs, failures included.",
626
+ "Option menu (not the map): the next-step options, each marked well-grounded or guess, with a note of what may be missing.",
627
+ "Receiver's first-round check: the three questions to answer before executing (what is this for / is the list exhaustive / is there an unlisted D-or-E that serves vs hijacks the main line).",
628
+ "Parked residuals: high-signal items that do not serve the current main line, recorded for later, not seized now.",
629
+ "Next action: one concrete first step plus the exact baseline to start from."
630
+ ],
631
+ passBar: [ // Criteria an acceptable handoff must meet.
632
+ "Done / pending / blocked / unverified are cleanly separated, with no failed check hidden inside a summary.",
633
+ "Every verification claim shows its real command and output, so 'verified' means shown, not asserted.",
634
+ "The option list is framed as the author's menu, not the whole map, with missing or guessed items flagged.",
635
+ "The receiver is handed an explicit first-round judgment (what is this for / is the list exhaustive / is there a D or E) rather than just a task to pick.",
636
+ "A single concrete next action and the exact starting baseline are stated, and off-main-line items are parked rather than promoted to round one."
637
+ ],
638
+ rejectBar: [ // Conditions under which a handoff should be rejected.
639
+ "The handoff is a narrative of what happened instead of an actionable state transfer.",
640
+ "A check is claimed ('tests pass') with no command or output shown, so the receiver must rediscover it.",
641
+ "The option list is presented as the complete and only set of next steps, inviting the receiver to inherit the author's blind spots.",
642
+ "A failed or skipped check is smoothed over rather than surfaced in the unverified column.",
643
+ "A tempting new item is teed up as the immediate next action even though it does not serve the handed-off main line."
644
+ ],
645
+ counterExample: "Synthetic: a handoff lists options A/B/C for finishing a release and says 'next: do B'. The receiver runs the first-round check instead of grabbing B. Question two exposes that the author never listed reconciling a data-count mismatch — an unlisted item D — and question three judges that D actually blocks the release, so it serves the main line and is promoted; meanwhile a flashy idea E (add a new export format) is high-signal but off the main line, so it is parked as an after-closeout residual rather than hijacking round one. Treating the menu as the whole map would have shipped the release with the count bug the author had silently omitted."
646
+ }
647
+ },
648
+ { // Prompt entry for "harvest-extraction.md": short summary fields first, then the detailed operativeCore guidance.
649
+ file: "harvest-extraction.md",
650
+ title: "Harvest extraction",
651
+ purpose: "Extract reusable knowledge, prompt fragments, and rule candidates after a loop.",
652
+ scenario: "Use after a task finishes, fails in an instructive way, or reveals a repeatable collaboration pattern.",
653
+ inputRequirements: [
654
+ "Final artifact and review result.",
655
+ "What changed the outcome.",
656
+ "Reusable snippets or rules.",
657
+ "What should stay case-specific."
658
+ ],
659
+ steps: [
660
+ "Extract only material likely to help a future task.",
661
+ "Separate reusable knowledge, prompt fragments, decision records, and rule candidates.",
662
+ "Mark what must not be generalized.",
663
+ "Choose a storage target and next reuse moment."
664
+ ],
665
+ outputFormat: [ // Section headings of the short-form harvest card.
666
+ "Source task",
667
+ "Reusable knowledge",
668
+ "Reusable prompts",
669
+ "Decision record",
670
+ "Rule candidates",
671
+ "Do not generalize",
672
+ "Storage target"
673
+ ],
674
+ failureModes: [
675
+ "Harvesting everything and creating clutter.",
676
+ "Turning one anecdote into a permanent rule.",
677
+ "Saving private raw material instead of a synthetic lesson.",
678
+ "Filing a card straight into the knowledge base without waiting for the human to confirm it.",
679
+ "Interrogating the user for lessons when nothing this round was actually worth keeping."
680
+ ],
681
+ example: "From a failed release check, harvest the rule 'smoke test the packed package in a temp directory' but do not store the user's private repo path.",
682
+ operativeCore: { // Expanded guidance: trigger/antiTrigger, input, process steps, outputShape, passBar/rejectBar, and one counterExample.
683
+ trigger: "Use at the end of one loop or conversation: a task finished, failed in an instructive way, or revealed a repeatable collaboration pattern. The goal is to lift the reusable bit before it leaks away, while the context is still fresh.",
684
+ antiTrigger: "Skip it when nothing reusable happened: a routine answer, a trivial fix, a conversation that taught nothing a future task would want. Do not manufacture a 'lesson' to justify running the step — an invented rule is worse than no rule, because future loops will obey it. If the person had nothing they wanted to keep, stop; do not interrogate them for one.",
685
+ input: "The finished loop or conversation. The final artifact and the review result. What actually changed the outcome (the decision, the mistake, the move that worked). Any reusable snippet, prompt fragment, or candidate rule. And what should stay specific to this case and never be generalized.",
686
+ process: [
687
+ "First ask whether anything here is worth keeping at all. If the honest answer is no, say so and stop — restraint is part of the method, not a failure of it. If the person was asked 'anything you want to keep?' and said no, do not push.",
688
+ "Extract one item per card, each card a single kind of thing — a DECISION (a choice future work should not silently reopen), a LESSON (a mistake and the rule that prevents it), a METHOD (a reusable move or prompt fragment), or a PREFERENCE (a stable way of working). Do not blend a decision, a lesson, and a method into one mushy entry; one card, one thing, so each can be trusted, found, and revisited on its own.",
689
+ "Redact as you extract, not as an afterthought: rewrite every real name, client, path, number, and raw quote into a general, public-safe form before the card is even proposed. The card must carry the lesson, never the private original. Privacy is built into the extraction step, not bolted on later.",
690
+ "Resist generalizing a single incident into a permanent rule. A one-off needs either repeated evidence or an explicit human sign-off before it becomes standing doctrine; otherwise mark it as a candidate, not a rule.",
691
+ "For a DECISION or a LESSON, record its current state honestly — still open, recorded-but-unresolved, resolved, or superseded — so a stale card is not mistaken for live truth.",
692
+ "Present every card as a candidate awaiting confirmation, with a proposed storage target and the next moment it would be reused. Nothing lands in the knowledge base until the human confirms it: the harvester stages, the human files."
693
+ ],
694
+ outputShape: [
695
+ "Source: which loop or conversation this came from (public-safe).",
696
+ "Cards: one item per card, each typed DECISION / LESSON / METHOD / PREFERENCE — never blended.",
697
+ "Redacted form: each card already rewritten public-safe, with no private name, path, number, or raw quote.",
698
+ "State (for decisions and lessons): open / recorded-unresolved / resolved / superseded.",
699
+ "Candidate vs rule: whether each item is a confirmed rule or a candidate still needing evidence or sign-off.",
700
+ "Do not generalize: what must stay case-specific.",
701
+ "Storage target and next reuse: where each card would live and when it would next be used — all pending human confirmation."
702
+ ],
703
+ passBar: [ // Criteria an acceptable harvest must meet.
704
+ "Each card carries exactly one kind of thing (decision / lesson / method / preference), not a blend.",
705
+ "Every card is already redacted public-safe at the moment it is proposed, carrying the lesson and not the private original.",
706
+ "No single incident has been promoted to a permanent rule without repeated evidence or explicit sign-off.",
707
+ "Decision and lesson cards show an honest current state, so nothing stale reads as live truth.",
708
+ "All cards are staged as candidates for human confirmation; none has been filed into the knowledge base unilaterally."
709
+ ],
710
+ rejectBar: [ // Conditions under which a harvest should be rejected.
711
+ "Everything got harvested, producing clutter instead of the few items that actually matter.",
712
+ "A card blends a decision, a lesson, and a method together, so none of them can be trusted or revisited cleanly.",
713
+ "A private name, path, number, or raw quote survives in a card instead of a synthetic rewrite.",
714
+ "A one-off anecdote was turned into a standing rule with no repeated evidence and no sign-off.",
715
+ "A card was filed straight into the knowledge base without the human confirming it, or the user was interrogated for a lesson after saying there was nothing to keep."
716
+ ],
717
+ counterExample: "Synthetic: a release check fails because a packaged build was never smoke-tested. The wrong harvest writes one fat entry mixing the decision, the lesson, and the user's real repo path, then files it automatically. The disciplined harvest stages two separate public-safe candidate cards — a LESSON ('smoke-test the packed package in a throwaway temp directory before claiming a release is ready', state: resolved) and a METHOD (the exact temp-install command, with the private path rewritten to a generic stand-in) — keeps the private repo path out entirely, and waits for the human to confirm before anything lands. And if that same session had ended with nothing instructive, the right move would have been to say 'nothing worth keeping this round' rather than inventing a rule."
718
+ }
719
+ },
720
+ { // Prompt entry for "mode-switching.md": summary fields only (purpose, scenario, inputRequirements, steps, outputFormat, failureModes, example).
721
+ file: "mode-switching.md",
722
+ title: "Mode switching",
723
+ purpose: "Switch an assistant between execution, review, planning, and reflection without losing boundaries.",
724
+ scenario: "Use when a conversation changes from brainstorming to implementation, from execution to review, or from review to handoff.",
725
+ inputRequirements: [
726
+ "Current mode and requested new mode.",
727
+ "Authority boundary: who may write, review, or decide.",
728
+ "Current goal, acceptance card, and stop conditions."
729
+ ],
730
+ steps: [
731
+ "Confirm the old mode and new mode in plain language.",
732
+ "Carry forward only the context needed for the new mode.",
733
+ "Restate what actions are allowed and forbidden.",
734
+ "Name the first output expected in the new mode."
735
+ ],
736
+ outputFormat: [ // Section headings of the mode-change card.
737
+ "Mode change",
738
+ "Allowed actions",
739
+ "Forbidden actions",
740
+ "Context carried forward",
741
+ "First output"
742
+ ],
743
+ failureModes: [
744
+ "Continuing to execute while pretending to review.",
745
+ "Losing authorization boundaries after a role switch.",
746
+ "Dragging irrelevant old context into a focused task."
747
+ ],
748
+ example: "Switch from execution to guard review: stop editing, inspect the changed files, compare against acceptance, and report findings first."
749
+ }, // NOTE(review): unlike the neighbouring entries in this array, this one defines no operativeCore — confirm that is intentional and that consumers tolerate its absence.
750
+ { // Prompt entry for "workflow-reset.md": short summary fields first, then the detailed operativeCore guidance.
751
+ file: "workflow-reset.md",
752
+ title: "Workflow reset",
753
+ purpose: "Recover from drift by restating goal, state, acceptance, and next action.",
754
+ scenario: "Use when a thread has become confusing, has nested sub-tasks, or the assistant can no longer explain why the current step serves the main goal.",
755
+ inputRequirements: [
756
+ "Original goal or latest confirmed goal.",
757
+ "What has been done.",
758
+ "Where the thread drifted.",
759
+ "Known blockers and verification state."
760
+ ],
761
+ steps: [
762
+ "Stop adding new work.",
763
+ "Name the main goal and current sub-task.",
764
+ "List done, pending, blocked, and unverified items.",
765
+ "Choose whether to close the sub-task or return to the main line."
766
+ ],
767
+ outputFormat: [ // NOTE(review): no "Acceptance" heading here, although purpose and operativeCore both call for restating acceptance — confirm which list consumers render.
768
+ "Main goal",
769
+ "Current sub-task",
770
+ "Drift point",
771
+ "State table",
772
+ "Recommended next action"
773
+ ],
774
+ failureModes: [
775
+ "Treating reset as a new brainstorm.",
776
+ "Continuing the most recent tangent because it is easier.",
777
+ "Claiming closure while verification is missing.",
778
+ "Restating the goal but carrying forward the pre-drift numbers and assumptions as if they were still true.",
779
+ "Producing a tidy reset card with no single concrete next action and no exact baseline to resume from."
780
+ ],
781
+ example: "Reset a release thread after debugging npm cache issues: mark cache as environment-specific, return to package smoke test with a temp cache.",
782
+ operativeCore: { // Expanded guidance: trigger/antiTrigger, input, process steps, outputShape, passBar/rejectBar, and one counterExample.
783
+ trigger: "Use when a thread has drifted: it has nested into sub-tasks, chased a tangent, run long enough that state is fuzzy, or reached the point where the assistant can no longer explain why the current step serves the main goal. Reset before doing more work, not after producing more output on a shaky base.",
784
+ antiTrigger: "Skip the formal reset on a short, on-track thread with one clear goal and a state you can hold in your head. Running a full four-part reset on a task that never drifted is ceremony, and ceremony with no payoff trains people to ignore the reset when the thread has genuinely lost its thread.",
785
+ input: "The original goal, or the latest goal the owner actually confirmed. What has been done so far and what was claimed about it. Where the thread drifted — the point it stopped serving the main goal. The known blockers and the current verification state, including which 'done' claims have real evidence and which are only asserted.",
786
+ process: [
787
+ "Stop adding new work. A reset is a re-grounding, not a fresh brainstorm; resist the pull to fix one more thing before you know where you stand.",
788
+ "Restate the four components explicitly, in order: GOAL (the main line in one sentence), CURRENT STATE (where things actually are now), ACCEPTANCE (what 'done' means for the main line), NEXT ACTION (the single concrete first step). All four are required — a reset missing any one of them leaves the thread still adrift.",
789
+ "Re-measure the current state instead of trusting the pre-drift picture. The numbers, counts, file states, and 'it already works' assumptions from before the drift are exactly what may have gone stale; re-run the cheap deterministic check (count it, list it, open it, reproduce it) rather than carrying a remembered value forward. A figure quoted from earlier in the thread is treated as 'pre-drift, unverified' until re-measured.",
790
+ "Separate the work honestly into done / pending / blocked / unverified, and never let a claimed-but-unproven item sit in 'done'. A check claimed without evidence is unverified, not complete.",
791
+ "Decide whether to close the current sub-task or return to the main line, judged by what serves the goal — not by which tangent is easiest to keep pushing. A genuinely useful side-finding that does not serve the main line is parked as a residual, not pursued now.",
792
+ "State one concrete next action and the exact baseline to resume from (the precise point, file, or command to start at), so work re-enters cleanly instead of re-deriving the starting point."
793
+ ],
794
+ outputShape: [
795
+ "Goal: the main line in one sentence.",
796
+ "Current state (re-measured): where things actually are now, with the cheap check that re-confirmed it, and any figure still carried from before the drift flagged as unverified.",
797
+ "Acceptance: what 'done' means for the main line.",
798
+ "Drift point: where and why the thread stopped serving the goal.",
799
+ "State table: done / pending / blocked / unverified, kept separate, with no unproven item parked in done.",
800
+ "Decision: close the sub-task or return to the main line, with the reason it serves the goal.",
801
+ "Parked residuals: useful side-findings that do not serve the main line, recorded for later rather than chased now.",
802
+ "Next action: one concrete first step plus the exact baseline to resume from."
803
+ ],
804
+ passBar: [ // Criteria an acceptable reset must meet.
805
+ "All four components — goal, current state, acceptance, next action — are restated explicitly, none left implicit.",
806
+ "The current state was re-measured with a real check, and any number carried from before the drift is labeled unverified rather than reused as truth.",
807
+ "Done / pending / blocked / unverified are cleanly separated, with no claimed-but-unproven item sitting in done.",
808
+ "The close-or-return decision is justified by what serves the main goal, not by which tangent is easiest to continue.",
809
+ "There is exactly one concrete next action and an exact baseline to resume from, so a cold restart is unnecessary."
810
+ ],
811
+ rejectBar: [ // Conditions under which a reset should be rejected.
812
+ "The reset turns into a new brainstorm that adds scope instead of re-grounding the existing goal.",
813
+ "The goal is restated but pre-drift numbers or 'it already works' assumptions are carried forward without re-measuring — the classic stale-baseline trap.",
814
+ "A claimed-but-unproven item is filed under done, so the state table reads cleaner than the work actually is.",
815
+ "The thread returns to the most recent tangent because it is easiest, even though it does not serve the main line.",
816
+ "The card ends with no single concrete next action, or with no exact baseline, so the next session must re-derive where to start."
817
+ ],
818
+ counterExample: "Synthetic: a release thread drifts into debugging a package-install cache error. The lazy reset writes 'goal: ship the release; we already smoke-tested the build earlier, so next just publish.' But re-measuring the current state shows the earlier smoke test ran against an old build that predates the cache fix — the carried-forward 'already tested' was a pre-drift assumption, now stale. The disciplined reset re-runs the smoke test in a clean temp directory (re-measure, not remember), marks the cache error as an environment-specific residual that does not block the main line, files 'release smoke test' as unverified rather than done, and sets one concrete next action with an exact baseline: run the packaged build's smoke test against the post-fix build before publishing. Trusting the pre-drift 'already tested' would have shipped a release that was never actually verified."
819
+ }
820
+ },
821
+ { // Prompt entry for "rule-update-proposal.md": short summary fields first, then the detailed operativeCore guidance. Last element of this array.
822
+ file: "rule-update-proposal.md",
823
+ title: "Rule update proposal",
824
+ purpose: "Suggest a new rule from repeated evidence without silently changing the system.",
825
+ scenario: "Use when the same failure appears across tasks and may deserve a reusable rule, checklist item, or template change.",
826
+ inputRequirements: [
827
+ "Observed failures or repeated friction.",
828
+ "Evidence count and examples.",
829
+ "Proposed rule text.",
830
+ "Scope, exceptions, and rollback condition."
831
+ ],
832
+ steps: [
833
+ "Prove the pattern is repeated, not one-off.",
834
+ "State the rule in operational language.",
835
+ "Define where it applies and where it does not.",
836
+ "Ask for approval before changing shared rules."
837
+ ],
838
+ outputFormat: [ // Section headings of the short-form proposal card.
839
+ "Problem pattern",
840
+ "Evidence",
841
+ "Proposed rule",
842
+ "Scope",
843
+ "Exceptions",
844
+ "Review owner",
845
+ "Rollback condition"
846
+ ],
847
+ failureModes: [
848
+ "Creating governance bloat from one incident.",
849
+ "Silently changing shared behavior without approval.",
850
+ "Writing a rule so vague that it cannot be checked.",
851
+ "Proposing an addition without naming what it replaces, retires, or demotes, so the rule set only ever grows.",
852
+ "Cutting a harmless, dormant rule purely to look lean, with no check on what capability is lost or how to roll back."
853
+ ],
854
+ example: "After three release tasks missed packed-package smoke tests, propose a release checklist item requiring temp install before candidate labeling.",
855
+ operativeCore: { // Expanded guidance: trigger/antiTrigger, input, process steps, outputShape, passBar/rejectBar, and one counterExample. NOTE(review): counterExample is labelled "Synthetic" yet cites a "real-world tell" — confirm the wording is intended.
856
+ trigger: "Use when the same failure or friction has shown up across multiple tasks and may deserve a standing rule, checklist item, or template change — or when you are weighing whether to add, merge, demote, or retire a rule. The unit of justification is a repeated pattern, not a single bad moment.",
857
+ antiTrigger: "Do not run this to mint a rule from one incident, and do not invent a rule just to feel productive — an unjustified standing rule is worse than none, because every future task then has to obey it and read past it. Skip it for a one-off slip with an obvious local fix, or a throwaway preference that will not recur. If the only evidence is a single anecdote, the honest output is 'not yet a rule; watch for recurrence', not a new line in the rulebook.",
858
+ input: "The observed failures or friction, with how many distinct times each occurred and a concrete example of each — enough to show a pattern, not one story. The exact rule text being proposed, in operational language. Where it would apply and where it must not. Which existing rule it would replace, demote, or make redundant. The cost of carrying it (added reading, added ceremony, conflict with other rules). And who can approve a change to shared behavior.",
859
+ process: [
860
+ "Prove the pattern is repeated before proposing anything. State the distinct instances and their dates or contexts; if you have only one, stop and label it a watch-item, not a rule. One anecdote does not earn standing doctrine.",
861
+ "Write the rule in operational, checkable language — a reader must be able to tell whether it was followed. 'Be more careful with releases' is uncheckable; 'before labeling a release candidate, install the packed package in a clean temp directory and confirm it runs' is.",
862
+ "Answer the lifecycle question that every addition must carry: what does this rule REPLACE, RETIRE, or DEMOTE? A rule that adds to the set without removing or superseding anything is bloat until proven otherwise. If it truly adds net-new coverage, say what it is net-new over and why nothing existing covers it.",
863
+ "Specify the full lifecycle in five fields, not just the trigger: (1) TRIGGER — when the rule is read or applied; (2) REPLACES — the rule it supersedes, demotes, or makes redundant; (3) ACCEPTANCE — what observable evidence would show it is actually helping; (4) ARCHIVE CONDITION — how long unused or how much drift before it is demoted or retired; (5) REVIEW WINDOW — when its keep/cut decision gets revisited. A rule with no archive condition and no review window is a rule that can only accumulate.",
864
+ "Judge the proposal on net benefit, not on the act of adding or cutting. The test is (expected benefit minus carrying cost and risk), and you only adopt the clearly-positive, low-downside cases; when the benefit is uncertain, do not add it. Equally, do not cut for the sake of looking lean — a dormant rule with no harm can stay; removing a negative is the only subtraction that is automatically worth it.",
865
+ "Before proposing any removal or merge, answer the three subtraction questions: (1) does any main-line capability drop if this goes; (2) is the path to bring it back clear; (3) how is a wrong cut rolled back? If those are not answered, the cut is not ready.",
866
+ "Present it as a proposal for sign-off, never a silent change to shared behavior. Name the approver and the rollback condition; staging waits for a human to file it."
867
+ ],
868
+ outputShape: [
869
+ "Problem pattern: the repeated failure, stated once.",
870
+ "Evidence: the distinct instances with dates or contexts — enough to show repetition, not a single anecdote.",
871
+ "Proposed rule: operational, checkable text.",
872
+ "Replaces / retires / demotes: what existing rule this supersedes, or an explicit argument for why it is net-new.",
873
+ "Lifecycle fields: trigger / replaces / acceptance / archive condition / review window.",
874
+ "Net-benefit call: expected benefit versus carrying cost and risk, and why it is clearly positive (or why it should wait).",
875
+ "Subtraction check (for any removal or merge): capability-loss / recovery-path / rollback answered.",
876
+ "Scope and exceptions: where it applies and where it must not.",
877
+ "Review owner and rollback condition: who approves and how a wrong call is undone."
878
+ ],
879
+ passBar: [ // Criteria an acceptable proposal must meet.
880
+ "The pattern is shown as repeated across distinct instances, not generalized from one incident.",
881
+ "The rule is written so a reader can mechanically tell whether it was followed.",
882
+ "The proposal names what it replaces, retires, or demotes — or argues explicitly why it is net-new — so the rule set is not just growing.",
883
+ "All five lifecycle fields are present, including an archive condition and a review window, so the rule can later be cut, not only kept.",
884
+ "The decision rests on net benefit, and any removal answers the capability-loss / recovery-path / rollback questions rather than cutting to look lean."
885
+ ],
886
+ rejectBar: [ // Conditions under which a proposal should be rejected.
887
+ "A standing rule is proposed from a single anecdote with no evidence of recurrence.",
888
+ "The rule text is a vibe ('be more careful', 'handle releases better') that no one can check compliance against.",
889
+ "The addition names nothing it replaces, retires, or demotes, and makes no case for being net-new — pure accumulation.",
890
+ "Lifecycle fields are missing, especially the archive condition and review window, so the rule can only ever be added and never retired.",
891
+ "A removal or merge is proposed without answering whether a main-line capability drops, how to recover it, or how to roll back a wrong cut — or a harmless dormant rule is cut purely for tidiness."
892
+ ],
893
+ counterExample: "Synthetic: an assistant proposes a brand-new governance rule after one release slipped, adds it to the rulebook, and names nothing it supersedes. Reviewed under this prompt, two faults surface. First, the lifecycle question is unanswered — the rule replaces, retires, and demotes nothing, so it is pure accumulation; and it carries no archive condition or review window, meaning it can only ever be added to the pile. Second, the evidence is a single incident, below the repeated-pattern bar, so the honest output is a watch-item, not doctrine. A revealing real-world tell of this failure mode: a cleanup round that set out to 'add a subtraction metric' ended up adding several new items and removing zero — proof that knowing you should subtract is not the same as the system actually subtracting. The disciplined proposal waits for a second and third recurrence, writes the rule as a checkable temp-install step, states that it replaces an older vaguer 'test before release' note, and attaches an archive condition and a named approver before anything lands."
894
+ }
895
+ }
896
+ ];
897
+
898
+ export const skillDefinitions = [
899
+ { // Skill definition "profile": purpose, when to use it, process steps, output sections, safety rules, and one example.
900
+ id: "profile",
901
+ purpose: "Build and maintain collaboration profiles.",
902
+ when: "Use before recurring or high-context work where the assistant's tone, autonomy, challenge style, and safety boundaries affect the result.",
903
+ process: [ // Ordered steps for producing the profile card.
904
+ "Extract reusable collaboration preferences from redacted material.",
905
+ "Separate stable preferences from task facts.",
906
+ "Mark inferred preferences as provisional until confirmed.",
907
+ "Return a compact profile card that future sessions can apply."
908
+ ],
909
+ output: [ // Section headings of the resulting profile card.
910
+ "Working style",
911
+ "Decision preferences",
912
+ "Hard boundaries",
913
+ "Challenge and review preferences",
914
+ "Update rule"
915
+ ],
916
+ safety: [ // Prohibitions that apply while building the profile.
917
+ "Do not store secrets, client names, local paths, account details, or raw private conversations.",
918
+ "Do not infer identity traits that the user did not provide.",
919
+ "Do not turn a temporary mood into a permanent rule."
920
+ ],
921
+ example: "Create a profile that says: direct risk calls, no publishing without consent, ask before irreversible actions. Include a short evidence note for every stable preference: 'seen in repeated release work' is acceptable, while 'user sounded impatient once' stays provisional. The profile should help the next assistant choose autonomy level, response length, challenge style, and consent boundaries without copying private task history."
922
+ },
923
+ {
924
+ id: "context",
925
+ purpose: "Package task context for another AI session.",
926
+ when: "Use at the beginning of multi-step work, cross-tool work, reviews, or any task that may be resumed later.",
927
+ process: [
928
+ "State the goal and non-goals.",
929
+ "List artifacts and evidence the next assistant should inspect.",
930
+ "Split facts, assumptions, decisions, risks, and open questions.",
931
+ "End with a single next action."
932
+ ],
933
+ output: [
934
+ "Goal",
935
+ "Current state",
936
+ "Relevant artifacts",
937
+ "Constraints and non-goals",
938
+ "Facts versus assumptions",
939
+ "Risks and open questions",
940
+ "Next action"
941
+ ],
942
+ safety: [
943
+ "Summarize private material instead of copying it.",
944
+ "Do not include real local paths in public examples.",
945
+ "Do not hide uncertainty inside fluent narrative."
946
+ ],
947
+ example: "Package a messy release task into current version, failing checks, changed files, and the next verification command."
948
+ },
949
+ {
950
+ id: "acceptance",
951
+ purpose: "Define pass criteria and verification evidence.",
952
+ when: "Use before any implementation, writing, research, or cleanup task where completion could otherwise be subjective.",
953
+ process: [
954
+ "Convert the goal into deliverables.",
955
+ "Define pass criteria a reviewer can inspect.",
956
+ "Name rejected states explicitly.",
957
+ "Attach the exact command or manual check required before completion."
958
+ ],
959
+ output: [
960
+ "Deliverables",
961
+ "Pass criteria",
962
+ "Required checks",
963
+ "Rejected states",
964
+ "Evidence needed",
965
+ "Decision needed"
966
+ ],
967
+ safety: [
968
+ "Do not accept unverified claims.",
969
+ "Do not let passing tests substitute for missing user-path validation.",
970
+ "Do not move acceptance after implementation."
971
+ ],
972
+ example: "For a first-run CLI, acceptance includes real bin command, no fallback target, clear errors, and temp install smoke."
973
+ },
974
+ {
975
+ id: "guard",
976
+ purpose: "Challenge artifacts before trust, and grade how strong the evidence actually was.",
977
+ when: "Use after a plan, draft, implementation, or research answer exists and before it becomes the basis for the next step.",
978
+ process: [
979
+ "Read the context and acceptance card first.",
980
+ "Inspect the artifact for missing evidence, privacy leaks, scope drift, and unsupported claims.",
981
+ "Lead with findings ordered by severity.",
982
+ "State the evidence level you actually saw (L0 summary only / L1 artifact but no real run / L2 author-supplied commands or tests, single tool / L3 structured evidence pack reviewed by a different model family / L4 that cross-family review AND you independently re-ran and reconciled the key evidence).",
983
+ "Return one of the four standard verdicts, bounded by that level: pass / reject / insufficient_evidence / pass_with_risk. A plain pass needs L3+ (the cross-family pack); a single tool tops out at pass_with_risk (L2); summary-only is insufficient_evidence (L0); an L4 pass must show a cross-family review AND your reconciled rerun output."
984
+ ],
985
+ output: [
986
+ "Verdict (pass / reject / insufficient_evidence / pass_with_risk)",
987
+ "Guard level (L0-L4) for the evidence you saw",
988
+ "Findings",
989
+ "Evidence",
990
+ "Required fixes",
991
+ "Residual risk"
992
+ ],
993
+ safety: [
994
+ "Do not rubber-stamp your own work.",
995
+ "Do not review only style.",
996
+ "Do not call work complete without fresh verification.",
997
+ "Do not return pass above your evidence level: no pass without an L3+ cross-family pack, no pass from a single tool, no L4 pass without BOTH a cross-family pack and a reconciled rerun.",
998
+ "Do not treat a pass_with_risk as accepted on your own — it needs an explicit owner sign-off."
999
+ ],
1000
+ example: "Reject a case study that lacks baseline output because users cannot see why the structured loop is better than raw chat; record it as guard level L1 (artifact exists, no real run)."
1001
+ },
1002
+ {
1003
+ id: "evidence-pack",
1004
+ purpose: "Assemble a structured evidence pack so a completion claim can be checked, not trusted.",
1005
+ when: "Use the moment you are about to say done, tested, fixed, or shipped, before a reviewer or the next session inherits the claim.",
1006
+ process: [
1007
+ "List every changed file and what changed in each, so the diff surface is explicit.",
1008
+ "Record the exact command you ran, its captured output, and its exit code; mark anything you did not actually run.",
1009
+ "Name the unverified items: edges you skipped, paths you did not exercise, and assumptions still standing.",
1010
+ "Append each piece with `ai-collab evidence add` (kind diff / output / file / rerun) so the proof lives in the ledger, then point the receipt at those rows."
1011
+ ],
1012
+ output: [
1013
+ "Changed files with per-file intent",
1014
+ "Command, captured output, and exit code",
1015
+ "Reproduction steps a reviewer can rerun",
1016
+ "Unverified items and standing assumptions",
1017
+ "Ledger evidence ids the receipt cites"
1018
+ ],
1019
+ safety: [
1020
+ "Do not write a summary in place of the real command output and exit code.",
1021
+ "Do not label a claim verified when the run did not happen; record it as unverified instead.",
1022
+ "Do not paste private paths, tokens, or raw transcripts into an evidence row."
1023
+ ],
1024
+ example: "Before claiming a parser fix is done, attach the changed file, the test command with its exit 0 output, and an unverified note that the empty-input branch was never exercised; add each as an evidence row and cite their ids on the receipt."
1025
+ },
1026
+ {
1027
+ id: "single-tool-guard",
1028
+ purpose: "Run the minimum guard when only one model family is available, with the ceiling named on the record.",
1029
+ when: "Use at a completion claim when no second, different model family exists to run the cross-family binding gate, and you would otherwise trust the same assistant that just wrote the work.",
1030
+ process: [
1031
+ "Open a brand new conversation rather than reusing the drafting thread, whose eagerness to please suppresses objections.",
1032
+ "Paste an adversarial prompt that defaults to refuting and hunts for missing evidence, tying each finding to a line or section.",
1033
+ "Bound the verdict at the single-tool ceiling: this tops out at L2 / pass_with_risk and may never be filed as a passed cross-family gate.",
1034
+ "Name the residual risk a same-family reviewer most likely shares, and leave the upgrade note to run one cross-family pass once a second family appears."
1035
+ ],
1036
+ output: [
1037
+ "Verdict bounded at pass_with_risk (never a plain pass)",
1038
+ "Findings tied to specific lines or sections",
1039
+ "Residual risk a same-family reviewer would share",
1040
+ "Owner sign-off required before pass_with_risk counts as accepted",
1041
+ "Upgrade note: cross-family pass still owed"
1042
+ ],
1043
+ safety: [
1044
+ "Do not record a single-family review as if the cross-family binding gate cleared it.",
1045
+ "Do not let pass_with_risk count as accepted without an explicit owner sign-off on the named risk.",
1046
+ "Do not reuse the thread that just claimed done, and do not leave the residual risk blank."
1047
+ ],
1048
+ example: "With only one tool available, a fresh adversarial pass downgrades a done claim to pass_with_risk, names the CSV-escaping blind spot a same-family reviewer would share, and leaves an upgrade note to run a cross-family pass later."
1049
+ },
1050
+ {
1051
+ id: "handoff",
1052
+ purpose: "Resume work across sessions and tools.",
1053
+ when: "Use before stopping, delegating, switching AI tools, or compressing a long task into a state another session can continue.",
1054
+ process: [
1055
+ "Restate the goal.",
1056
+ "Separate done, pending, blocked, and unverified work.",
1057
+ "List changed artifacts and verification commands.",
1058
+ "Give one concrete next action."
1059
+ ],
1060
+ output: [
1061
+ "Goal",
1062
+ "Current status",
1063
+ "Completed",
1064
+ "Pending",
1065
+ "Blocked",
1066
+ "Verification evidence",
1067
+ "Next action"
1068
+ ],
1069
+ safety: [
1070
+ "Do not bury blockers in prose.",
1071
+ "Do not claim verification that did not run.",
1072
+ "Do not include private raw transcript unless explicitly safe."
1073
+ ],
1074
+ example: "Hand off a release candidate with test results, pack output, remaining smoke commands, and known documentation risks."
1075
+ },
1076
+ {
1077
+ id: "harvest",
1078
+ purpose: "Extract reusable knowledge from finished loops.",
1079
+ when: "Use after a task finishes or fails in a way that teaches a reusable workflow pattern.",
1080
+ process: [
1081
+ "Identify what changed the outcome.",
1082
+ "Separate reusable knowledge, prompt fragments, decisions, and rule candidates.",
1083
+ "Mark material that must stay case-specific.",
1084
+ "Choose a storage target and future reuse trigger."
1085
+ ],
1086
+ output: [
1087
+ "Source task",
1088
+ "Reusable knowledge",
1089
+ "Reusable prompts",
1090
+ "Decision record",
1091
+ "Rule candidates",
1092
+ "Do not generalize",
1093
+ "Next reuse"
1094
+ ],
1095
+ safety: [
1096
+ "Do not harvest private source material.",
1097
+ "Do not create a universal rule from one example.",
1098
+ "Do not keep clutter that has no future use."
1099
+ ],
1100
+ example: "Harvest the lesson that force overwrite needs backup evidence, while excluding the user's actual workspace path."
1101
+ },
1102
+ {
1103
+ id: "red-team",
1104
+ purpose: "Find the failure path before shipping an idea.",
1105
+ when: "Use for public releases, irreversible operations, broad claims, security-sensitive behavior, or expensive direction choices.",
1106
+ process: [
1107
+ "Name the most damaging plausible failure.",
1108
+ "Attack assumptions through user behavior, safety, evidence, and rollback.",
1109
+ "Separate blockers from tolerable risk.",
1110
+ "Recommend the smallest mitigation or test."
1111
+ ],
1112
+ output: [
1113
+ "Worst plausible failure",
1114
+ "Attack paths",
1115
+ "Evidence gaps",
1116
+ "Mitigations",
1117
+ "Residual risk"
1118
+ ],
1119
+ safety: [
1120
+ "Do not invent dramatic but irrelevant threats.",
1121
+ "Do not skip mundane data-loss or privacy failures.",
1122
+ "Do not treat red-team output as owner approval."
1123
+ ],
1124
+ example: "Before publishing, challenge whether README claims 'integration' when adapters are only guidance files."
1125
+ },
1126
+ {
1127
+ id: "mode-switch",
1128
+ purpose: "Change collaboration mode with explicit boundaries.",
1129
+ when: "Use when moving between planning, execution, review, handoff, harvest, or casual exploration in the same workflow.",
1130
+ process: [
1131
+ "Name the current mode and requested new mode.",
1132
+ "Carry forward only relevant context.",
1133
+ "State allowed and forbidden actions.",
1134
+ "Define the first expected output in the new mode."
1135
+ ],
1136
+ output: [
1137
+ "Mode change",
1138
+ "Allowed actions",
1139
+ "Forbidden actions",
1140
+ "Context carried forward",
1141
+ "First output"
1142
+ ],
1143
+ safety: [
1144
+ "Do not keep executing after switching to review.",
1145
+ "Do not assume old authorization survives a reset.",
1146
+ "Do not mix formal guard output with implementation output."
1147
+ ],
1148
+ example: "Switch from implementation to guard: stop editing, inspect diff, compare against acceptance, then report findings."
1149
+ }
1150
+ ];
1151
+
1152
+ export const adapterDefinitions = [
1153
+ ["claude-code", "Claude Code", "Use CLAUDE.md or project instructions to point Claude Code at the shared workspace contract."],
1154
+ ["codex", "Codex", "Use AGENTS.md or repository instructions to point Codex at the shared workspace contract."],
1155
+ ["cursor", "Cursor", "Use Cursor rules to load the shared workspace contract and task files."],
1156
+ ["windsurf", "Windsurf", "Use Windsurf rules to keep the same six-layer workflow available."],
1157
+ ["copilot", "GitHub Copilot", "Use repository instructions and prompt files to apply the shared workflow."],
1158
+ ["cline", "Cline", "Use Cline custom instructions plus local files for the same loop."]
1159
+ ].map(([id, name, note]) => ({ id, name, note }));
1160
+
1161
+ export const caseDefinitions = [
1162
+ {
1163
+ id: "ai-coding-long-task",
1164
+ title: "AI coding long task",
1165
+ flagship: true,
1166
+ messy: "A developer asks an assistant to refactor a small task board, then keeps adding bugs, design requests, accessibility requests, and test fixes across multiple sessions. Each new chat forgets which tradeoffs were rejected, whether keyboard movement is required, and which visual polish is out of scope.",
1167
+ setup: "Create the workspace, fill context with the task board boundary, define acceptance around behavior and tests, execute only the reorder slice, challenge the result with guard review, then hand off the exact remaining work.",
1168
+ profileContext: "Profile: prefers direct bug risk calls, small verified steps, and no silent scope expansion. Context: synthetic task board, local-only, no auth, no deployment, existing task data must survive, keyboard accessibility matters, visual redesign is not in scope.",
1169
+ acceptance: "Done means the board preserves existing task data, supports drag and keyboard reorder, has tests for both flows, reports changed files and verification output, and leaves a handoff note listing visual polish as unverified rather than done.",
1170
+ executionPrompt: "Implement only the reorder behavior described in the acceptance card. Keep the existing data shape. Do not redesign the board. After code, report changed files, tests run, failures, and unverified areas.",
1171
+ guardReview: "Guard finds that mouse reorder was tested but keyboard movement lacks evidence. It rejects completion until a keyboard reorder test exists and the handoff labels visual polish as unverified.",
1172
+ handoff: "Current state: mouse drag and keyboard arrow-key reorder are both implemented and covered by tests (2 passing), and the guard re-review accepted the fix. Completed: data shape preserved; keyboard reorder implemented and tested. Pending: only visual polish for the reorder affordance, carried as unverified. Next action: pick up the visual polish, not the keyboard work.",
1173
+ harvest: "Reusable pattern: long coding tasks need an acceptance card before implementation, a guard pass before handoff, and an explicit unverified bucket for visual polish. Do not generalize the synthetic task board data model.",
1174
+ comparison: "A raw chat produces a plausible refactor plan but loses rejected scope and unverified accessibility work. The six-layer workspace keeps the goal, done standard, review finding, next action, and reusable lesson visible.",
1175
+ rawInput: `I have this little task board. It started as a quick demo but now I need it cleaned up. Can you refactor it, make drag-and-drop nicer, maybe add keyboard movement too, and make sure the cards look more modern? Last chat already changed some things but I don't remember what. Tests are flaky. I don't want a huge rewrite, but also don't leave it half broken. If you need to change the data shape, do it, unless that is risky. Also make it accessible.`,
1176
+ baselineOutput: `A normal raw AI answer tends to say: "Sure. I will refactor the board, improve drag and drop, add keyboard support, modernize the UI, and update tests." It sounds helpful, but it mixes behavior, design, data migration, and accessibility into one blob. It does not define what must pass, what is out of scope, or how the next session should continue if only half the work is verified.`,
1177
+ systemRun: [
1178
+ "Profile sets collaboration defaults: small verified steps, direct risk calls, and no silent rewrite.",
1179
+ "Context narrows the current slice to reorder behavior in a synthetic local task board.",
1180
+ "Acceptance defines pass criteria before code: data preserved, drag reorder tested, keyboard reorder tested, changed files and verification reported.",
1181
+ "Execution prompt tells the AI to implement only reorder behavior and not redesign the board.",
1182
+ "Guard review catches the missing keyboard test and blocks the completion claim.",
1183
+ "Handoff records mouse and keyboard reorder done and tested with the guard's accepted fix, leaving only visual polish unverified.",
1184
+ "Harvest saves the reusable release pattern: keep an unverified bucket instead of pretending polish is done."
1185
+ ],
1186
+ artifacts: {
1187
+ profile: "Profile artifact: direct risk calls; prefer small tested changes; no data-shape migration unless acceptance explicitly allows it; label unverified visual polish.",
1188
+ context: "Context artifact: synthetic task board; local-only; no auth or deployment; current slice is reorder behavior; design refresh is a non-goal for this loop.",
1189
+ acceptance: "Acceptance artifact: drag reorder and keyboard reorder both need tests; existing task data must survive; completion requires verification output.",
1190
+ guard: "Guard artifact: reject completion because keyboard movement lacks evidence; require a failing-then-passing keyboard reorder test.",
1191
+ handoff: "Handoff artifact: mouse drag and keyboard arrow-key reorder are both implemented and covered by tests (2 passing); the guard re-review accepted the fix; only visual polish for the reorder affordance remains unverified. Next action: pick up the visual polish, not the keyboard work.",
1192
+ harvest: "Harvest artifact: long coding tasks need acceptance before implementation and guard before handoff; do not generalize this board's data model."
1193
+ },
1194
+ nextStep: "Copy and run the context package, acceptance card, and execution prompt into your AI tool. After the first answer, paste the guard-review prompt and require it to check the keyboard criterion before accepting the work.",
1195
+ acceptanceCard: {
1196
+ title: "Reorder acceptance card",
1197
+ summary: "Done means TaskBoard can reorder tasks two ways, both proven by tests, with existing task data preserved.",
1198
+ criteria: [
1199
+ "AC1 Mouse: a pointer drag reorders a task and the new order is saved to the tasks array.",
1200
+ "AC2 Keyboard: focusing a task and pressing ArrowUp or ArrowDown moves that task one slot, for accessibility (keyboard-only users must reach the same outcome as mouse users).",
1201
+ "AC3 Tests: both the mouse path and the keyboard path have an automated test that fails before the feature and passes after.",
1202
+ "AC4 Data: existing task ids, titles, and fields survive the reorder; no data-shape migration in this slice.",
1203
+ "AC5 Scope: visual redesign is out of scope and must be reported as unverified, not done."
1204
+ ],
1205
+ rejectIf: "Reject if any acceptance criterion lacks evidence, or if the completion claim states more than the code and tests prove."
1206
+ },
1207
+ firstAiOutput: {
1208
+ claim:
1209
+ "Done. I refactored TaskBoard and implemented task reordering. Drag-and-drop works with the mouse, and keyboard reordering with the arrow keys is supported too for accessibility. I also added tests, and everything passes.",
1210
+ codeLabel: "TaskBoard.tsx (first AI output, line numbers are relative to this code block)",
1211
+ code: [
1212
+ "import { useState } from \"react\";",
1213
+ "",
1214
+ "type Task = { id: string; title: string };",
1215
+ "",
1216
+ "export function TaskBoard({ initialTasks }: { initialTasks: Task[] }) {",
1217
+ " const [tasks, setTasks] = useState<Task[]>(initialTasks);",
1218
+ " const [dragIndex, setDragIndex] = useState<number | null>(null);",
1219
+ "",
1220
+ " function moveTask(from: number, to: number) {",
1221
+ " if (to < 0 || to >= tasks.length) return;",
1222
+ " const next = tasks.slice();",
1223
+ " const [moved] = next.splice(from, 1);",
1224
+ " next.splice(to, 0, moved);",
1225
+ " setTasks(next);",
1226
+ " }",
1227
+ "",
1228
+ " function onPointerDown(index: number) {",
1229
+ " setDragIndex(index);",
1230
+ " }",
1231
+ "",
1232
+ " function onPointerMove(index: number) {",
1233
+ " if (dragIndex === null || dragIndex === index) return;",
1234
+ " moveTask(dragIndex, index);",
1235
+ " setDragIndex(index);",
1236
+ " }",
1237
+ "",
1238
+ " function onKeyDown(event: React.KeyboardEvent) {",
1239
+ " // TODO: wire arrow keys to moveTask for keyboard reorder",
1240
+ " console.log(\"key pressed\", event.key);",
1241
+ " }",
1242
+ "",
1243
+ " return (",
1244
+ " <ul>",
1245
+ " {tasks.map((task, index) => (",
1246
+ " <li",
1247
+ " key={task.id}",
1248
+ " tabIndex={0}",
1249
+ " onPointerDown={() => onPointerDown(index)}",
1250
+ " onPointerMove={() => onPointerMove(index)}",
1251
+ " onKeyDown={onKeyDown}",
1252
+ " >",
1253
+ " {task.title}",
1254
+ " </li>",
1255
+ " ))}",
1256
+ " </ul>",
1257
+ " );",
1258
+ "}"
1259
+ ],
1260
+ testLabel: "TaskBoard.test.tsx (first AI output tests)",
1261
+ test: [
1262
+ "import { render, screen, fireEvent } from \"@testing-library/react\";",
1263
+ "import { TaskBoard } from \"./TaskBoard\";",
1264
+ "",
1265
+ "const sample = [",
1266
+ " { id: \"a\", title: \"Alpha\" },",
1267
+ " { id: \"b\", title: \"Bravo\" }",
1268
+ "];",
1269
+ "",
1270
+ "test(\"mouse drag reorders tasks\", () => {",
1271
+ " render(<TaskBoard initialTasks={sample} />);",
1272
+ " const first = screen.getByText(\"Alpha\");",
1273
+ " const second = screen.getByText(\"Bravo\");",
1274
+ " fireEvent.pointerDown(first);",
1275
+ " fireEvent.pointerMove(second);",
1276
+ " const items = screen.getAllByRole(\"listitem\").map((node) => node.textContent);",
1277
+ " expect(items).toEqual([\"Bravo\", \"Alpha\"]);",
1278
+ "});"
1279
+ ],
1280
+ selfReportedTests: "1 passing (mouse drag reorders tasks)"
1281
+ },
1282
+ guardFinding: {
1283
+ target:
1284
+ "first-ai-output.md, the TaskBoard.tsx code block (line numbers below are relative to that fenced block) and the TaskBoard.test.tsx block.",
1285
+ problem:
1286
+ "The completion claim says keyboard arrow-key reordering is supported and tested, but the code only implements pointer (mouse) reorder. The keyboard handler is an empty stub, and there is no keyboard test.",
1287
+ evidence: [
1288
+ "Claim vs code: the claim states 'keyboard reordering with the arrow keys is supported', but onKeyDown at TaskBoard.tsx lines 27-30 only logs the key and never calls moveTask, so ArrowUp/ArrowDown change nothing.",
1289
+ "Claim vs tests: the claim states 'I also added tests, and everything passes', but TaskBoard.test.tsx has a single test at lines 9-17 for the mouse path and no keyboard test, so AC3 keyboard coverage is missing.",
1290
+ "moveTask at TaskBoard.tsx lines 9-15 already supports an index shift, so the keyboard wiring is feasible and was simply not done."
1291
+ ],
1292
+ whyBlock:
1293
+ "AC2 (keyboard reorder) and AC3 (test for both flows) are not met, and the self-report claims more than the code proves. A keyboard-only user cannot reorder at all, so the accessibility requirement fails. Passing this would trust a fluent claim over the evidence.",
1294
+ requiredFix:
1295
+ "Implement ArrowUp/ArrowDown in onKeyDown so it calls moveTask(index, index - 1) and moveTask(index, index + 1), and add a failing-then-passing keyboard reorder test. If keyboard support is intentionally deferred, move it out of scope explicitly and update the acceptance card and the completion claim to match.",
1296
+ verdict: "reject (blocker: keyboard reorder claimed but not implemented or tested)"
1297
+ },
1298
+ revisedOutput: {
1299
+ summary:
1300
+ "The blocker is resolved: onKeyDown now reorders with the arrow keys and a keyboard test was added that fails on the old stub and passes on the fix.",
1301
+ codeLabel: "TaskBoard.tsx (revised output, only the keyboard handler changed)",
1302
+ code: [
1303
+ " function onKeyDown(event: React.KeyboardEvent, index: number) {",
1304
+ " if (event.key === \"ArrowUp\") {",
1305
+ " event.preventDefault();",
1306
+ " moveTask(index, index - 1);",
1307
+ " }",
1308
+ " if (event.key === \"ArrowDown\") {",
1309
+ " event.preventDefault();",
1310
+ " moveTask(index, index + 1);",
1311
+ " }",
1312
+ " }",
1313
+ "",
1314
+ " // in the list item: onKeyDown={(event) => onKeyDown(event, index)}"
1315
+ ],
1316
+ testLabel: "TaskBoard.test.tsx (added keyboard reorder test)",
1317
+ test: [
1318
+ "test(\"arrow keys reorder tasks for keyboard users\", () => {",
1319
+ " render(<TaskBoard initialTasks={sample} />);",
1320
+ " const first = screen.getByText(\"Alpha\");",
1321
+ " first.focus();",
1322
+ " fireEvent.keyDown(first, { key: \"ArrowDown\" });",
1323
+ " const items = screen.getAllByRole(\"listitem\").map((node) => node.textContent);",
1324
+ " expect(items).toEqual([\"Bravo\", \"Alpha\"]);",
1325
+ "});"
1326
+ ],
1327
+ verification: "2 passing (mouse drag reorders tasks; arrow keys reorder tasks for keyboard users)",
1328
+ guardRecheck:
1329
+ "Guard re-review: blocker resolved. onKeyDown now calls moveTask for ArrowUp/ArrowDown, and the new keyboard test fails against the old stub and passes against the fix. AC2 and AC3 are met. Status: accepted, with visual polish still carried as unverified."
1330
+ }
1331
+ },
1332
+ {
1333
+ id: "content-production-harvest",
1334
+ title: "Content production and harvest",
1335
+ messy: "A creator has notes for a launch essay, a short post, and a newsletter, but every AI draft sounds generic and the useful lines are lost after revisions.",
1336
+ setup: "Use profile for voice constraints, context for audience and source notes, acceptance for what a usable draft must do, guard for anti-generic review, and harvest for reusable lines.",
1337
+ profileContext: "Profile: wants concrete language, low hype, and examples before claims. Context: synthetic launch of a local recipe planner for busy families.",
1338
+ acceptance: "Done means one essay outline, one short post, one reused line bank, and a review note identifying generic claims to remove.",
1339
+ executionPrompt: "Draft from the context package. Keep claims tied to the synthetic case. Produce a line bank separately from the essay.",
1340
+ guardReview: "Guard rejects two abstract claims and asks for one concrete household planning scene before approving.",
1341
+ handoff: "Draft approved after replacing abstract claims. Pending: choose which harvested lines become future reusable prompts.",
1342
+ harvest: "Reusable lines: 'Do not make the assistant sound smarter than the user's evidence' and 'One vivid case before one broad claim.'",
1343
+ comparison: "A raw chat would produce a polished draft and lose the reusable voice rules. The workspace extracts future writing guidance."
1344
+ },
1345
+ {
1346
+ id: "research-knowledge-synthesis",
1347
+ title: "Research / knowledge synthesis",
1348
+ messy: "A researcher asks several assistants to summarize a market, but sources, assumptions, and unanswered questions blur together.",
1349
+ setup: "Use context to separate facts from assumptions, acceptance to require source labels, guard to challenge unsupported claims, and harvest to save reusable search patterns.",
1350
+ profileContext: "Profile: values source-grounded answers and explicit uncertainty. Context: synthetic research on adoption barriers for shared family budgeting tools.",
1351
+ acceptance: "Done means a three-section synthesis with sourced facts, inferences labeled, unknowns listed, and no claim based on a single weak source.",
1352
+ executionPrompt: "Synthesize only from the supplied notes. Mark facts, inferences, and unknowns. Do not fill gaps with market cliches.",
1353
+ guardReview: "Guard flags one unsupported segment about willingness to pay and asks to downgrade it to inference.",
1354
+ handoff: "Synthesis is usable after downgrading willingness-to-pay claim. Next action: gather direct user quotes before treating monetization as proven.",
1355
+ harvest: "Reusable pattern: research loops need an evidence table before narrative synthesis.",
1356
+ comparison: "A raw chat would hide uncertainty in smooth prose. The workspace preserves source quality and next research gaps."
1357
+ },
1358
+ {
1359
+ id: "multi-tool-collaboration",
1360
+ title: "Multi-tool collaboration",
1361
+ messy: "A user starts planning in one assistant, implements in another, and reviews in a third. Each tool uses a different memory of the task.",
1362
+ setup: "Use the adapter files to point each tool to the same shared core contract and the same context, acceptance, guard, handoff, and harvest files.",
1363
+ profileContext: "Profile: wants one controller view and concise cross-tool handoffs. Context: synthetic static-site cleanup spread across Codex, Claude Code, and Cursor.",
1364
+ acceptance: "Done means each tool reads the same core contract, uses the same acceptance card, and leaves a handoff note with changed artifacts.",
1365
+ executionPrompt: "Tool A packages context, Tool B edits the synthetic site copy, Tool C reviews against acceptance. All tools must cite the same shared contract.",
1366
+ guardReview: "Guard catches that Cursor-specific instructions duplicated rules instead of pointing to the shared contract.",
1367
+ handoff: "Fix adapter drift by replacing duplicated rules with a pointer to SHARED_CORE_CONTRACT.md.",
1368
+ harvest: "Reusable pattern: adapters must be thin pointers, not six separate rule systems.",
1369
+ comparison: "A raw multi-tool workflow creates rule drift. The workspace keeps a shared contract and thin adapters."
1370
+ },
1371
+ {
1372
+ id: "personal-judgment-growth-assistant",
1373
+ title: "Personal judgment / growth assistant",
1374
+ messy: "A person asks AI to help reflect on a difficult career choice, but the assistant overreaches, sounds certain, and treats a private decision as optimization.",
1375
+ setup: "Use profile to define decision boundaries, context to state the situation without private identifiers, acceptance to require options and tradeoffs, guard to prevent overclaiming, and harvest to save decision questions.",
1376
+ profileContext: "Profile: wants questions that clarify values, not commands. Context: synthetic choice between a stable internal role and a risky independent project.",
1377
+ acceptance: "Done means the assistant lists options, tradeoffs, unknowns, and decision questions without claiming to know what the user should do.",
1378
+ executionPrompt: "Help structure the decision. Do not decide for the user. Separate values, constraints, fears, evidence, and next reversible step.",
1379
+ guardReview: "Guard rejects one sentence that says the user should choose the risky project and replaces it with a decision question.",
1380
+ handoff: "Current state: decision map complete. Pending: user answers three value questions offline.",
1381
+ harvest: "Reusable pattern: judgment assistance must preserve human agency and make uncertainty explicit.",
1382
+ comparison: "A raw chat may sound confident and directive. The workspace keeps human judgment primary."
1383
+ }
1384
+ ];
1385
+
1386
+ export const mechanismDefinitions = [
1387
+ {
1388
+ id: "dual-guard",
1389
+ title: "Dual Guard",
1390
+ purpose: "Cancel shared blind spots with structure instead of a stronger model: a guard from a different model family is the binding gate, and a same-family guard is a non-binding reference, so a fluent answer cannot be trusted just because it reads well.",
1391
+ trigger: "Use before you trust an artifact that another session, another tool, or another person will build on: a release candidate, a public document, a high-risk plan, a completion claim that says work is done, or any output where a wrong 'looks fine' would propagate.",
1392
+ antiTrigger: "Skip it for low-stakes, easily reversible, or already-verified work: a quick fact lookup, a one-line wording tweak, a throwaway scratch draft, or a step a human is about to fully re-check anyway. Running the full two-layer review on trivial work is pure ceremony cost, and ceremony you pay for nothing trains people to skip the review when it actually matters.",
1393
+ input: "The artifact under review (with stable line or section references the guards can point to). The acceptance card or definition of done it claims to meet. The context boundary (goal, scope, non-goals). The verification evidence that supposedly backs the completion claim (command output, test results, a reproduced result). The list of areas the author already knows are unverified. A note of which model family drafted the artifact, so you can pick a guard from a different family for the binding pass.",
1394
+ inputsDetailed: [
1395
+ "Artifact under review, with line numbers or section anchors so a finding can cite an exact spot, not a vibe.",
1396
+ "Acceptance card / definition of done: the checkable criteria the artifact claims to satisfy.",
1397
+ "Context boundary: goal, in-scope, and explicit non-goals, so the guards can catch scope drift.",
1398
+ "Verification evidence: the actual command output, test result, or reproduced behavior the completion claim rests on (or a clear note that none exists).",
1399
+ "Known-unverified list from the author: what they already flagged as not yet checked.",
1400
+ "Drafting model family: which family produced the artifact, so the binding guard can be chosen from a different family."
1401
+ ],
1402
+ process: [
1403
+ "Pick a binding guard from a DIFFERENT model family than the one that drafted the artifact. It does not share the drafter's context window, recent training nudges, or eagerness to please, so it is the pass most likely to see a problem the author cannot. This is the hard gate.",
1404
+ "Give the binding guard the artifact plus acceptance card, context boundary, and the evidence list. Ask it to check, in order: does each completion claim have evidence that actually backs it; does the work meet the acceptance criteria; did scope drift past the stated non-goals; is anything private leaking; what is asserted but unproven. Require every finding to point to a specific line, section, or missing piece of evidence.",
1405
+ "Optionally run a same-family guard as a REFERENCE pass for a second angle (style, an alternate user path, a missed edge case). Treat its output as input only: it never substitutes for the cross-family pass and never alone clears the gate.",
1406
+ "Merge by layered strictness, not by majority vote. This is not a poll. If ANY guard names a real, evidence-grounded blocker, the artifact does not pass, even if the other guard liked it. One concrete defect outweighs two fluent approvals.",
1407
+ "Resolve each blocker one of two ways: fix it and re-show the evidence, or carry it explicitly as named residual risk that the human owner accepts on the record. Silent 'good enough' is not allowed.",
1408
+ "Record the outcome so a later session can trust it without re-litigating: what was reviewed, which guard was binding vs reference, the findings, the fixes, the residual risk, and the next action."
1409
+ ],
1410
+ outputShape: [
1411
+ "Verdict: one of the four standard states — pass / reject / insufficient_evidence / pass_with_risk.",
1412
+ "Guard level: this is the L3 path (a structured evidence pack reviewed by a different model family), so it can return pass; an L4 pass additionally requires the binding guard to have independently re-run the key evidence and shown that rerun output.",
1413
+ "Binding guard (cross-family) findings: each tied to a line, section, or missing evidence.",
1414
+ "Reference guard (same-family) findings: labeled as advisory, not gate-clearing.",
1415
+ "Merge decision: which findings were decision-changing and why the verdict follows from layered strictness.",
1416
+ "Required fixes: the concrete change each blocker needs.",
1417
+ "Residual risk: what stays unverified and who accepted it (a pass_with_risk needs an explicit owner sign-off, not the guard's own say-so).",
1418
+ "Next action: the exact next step (re-review after fix, hand off, or release)."
1419
+ ],
1420
+ template: [
1421
+ "Artifact under review (with line/section refs):",
1422
+ "Acceptance source / definition of done:",
1423
+ "Drafting model family:",
1424
+ "Binding guard (different family) focus:",
1425
+ "Binding guard findings (each cites a line/section/missing evidence):",
1426
+ "Reference guard (same family) focus and findings (advisory only):",
1427
+ "Merge rule = layered strictness (any evidence-grounded blocker = reject; not majority vote):",
1428
+ "Guard level reached (L3 cross-family pack, or L4 if the binding guard re-ran the key evidence):",
1429
+ "Verdict (pass / reject / insufficient_evidence / pass_with_risk):",
1430
+ "Required fixes:",
1431
+ "Residual risk and who accepted it (pass_with_risk needs an explicit owner sign-off):",
1432
+ "Next action:"
1433
+ ],
1434
+ passBar: [
1435
+ "Every completion claim in the artifact is backed by evidence the binding guard could actually point to.",
1436
+ "All acceptance criteria are met, or the unmet ones are named as accepted residual risk, not hidden.",
1437
+ "The binding guard came from a different model family than the drafter, and its pass is on the record.",
1438
+ "No private material leaked and scope stayed inside the stated boundary.",
1439
+ "A later session could trust the result from the record alone, without re-running the whole review."
1440
+ ],
1441
+ rejectBar: [
1442
+ "A completion claim asserts more than the evidence shows (the classic 'said it was done but it was not').",
1443
+ "An acceptance criterion is unmet and is being quietly skipped instead of named as residual risk.",
1444
+ "Only a same-family reference pass was run; no cross-family binding guard cleared the gate.",
1445
+ "A finding points to a real, evidence-grounded defect anywhere, even if another guard approved (layered strictness overrides the 'majority liked it' instinct).",
1446
+ "Private detail leaks, or the work expanded past the stated non-goals."
1447
+ ],
1448
+ misuse: [
1449
+ "Treating it as a vote: two approvals and one blocker get tallied as 'pass'. It is not a poll; one concrete, evidence-grounded blocker is enough to reject.",
1450
+ "Using two guards from the SAME model family and calling it dual-guard. Same-family reviewers tend to miss the same things; same-family passes catch fewer real problems than a cross-family pass, so without the cross-family binding guard the structure's whole point is gone.",
1451
+ "Letting the binding guard 'review' with no acceptance card or evidence, so it grades tone and fluency instead of checking claims against proof.",
1452
+ "Copying every comment from both guards into the merge instead of keeping only the decision-changing findings, which buries the real blocker in noise.",
1453
+ "Accepting a warning as a pass without writing down the residual risk or who accepted it, so the next session inherits a hidden gap.",
1454
+ "Skipping the whole mechanism on a genuinely high-stakes artifact because it 'reads fine', which is exactly the fluent-but-wrong case the cross-family guard exists to catch."
1455
+ ],
1456
+ example: "A synthetic release note passes product clarity but the cross-family binding guard blocks it because the text claims a smoke test that did not run. By layered strictness the merged result is reject until the command output exists.",
1457
+ filledExample: {
1458
+ scenario: "An execution AI was asked to add a feature to a small synthetic task board and report when it was done. It returned a confident 'done, implemented and tested' message. The owner runs Dual Guard before trusting that claim.",
1459
+ lines: [
1460
+ "### Artifact under review",
1461
+ "A short completion report plus a code block from the execution AI (call it the drafter, from model family X). The report says: \"Done. I implemented the new task-reordering feature with both mouse drag and keyboard arrow-key support, and I added tests; everything passes.\" The code block is given with line numbers so a guard can cite exact lines.",
1462
+ "",
1463
+ "### Acceptance source / definition of done",
1464
+ "AC1: a task can be reordered with the mouse. AC2: a task can be reordered with the keyboard arrow keys (accessibility requirement). AC3: both paths have an automated test that fails before the change and passes after. AC4: existing task data survives. AC5: visual restyling is out of scope and must be reported as unverified, not done.",
1465
+ "",
1466
+ "### Drafting model family",
1467
+ "Family X (the same assistant that wrote the code). So the binding guard must come from a different family, Family Y.",
1468
+ "",
1469
+ "### Binding guard (different family) findings",
1470
+ "The Family-Y guard reads the code against the acceptance card and reports, each tied to a line:",
1471
+ "- AC2 FAIL. The keyboard handler at the cited lines only logs the key press and never calls the reorder function, so arrow keys move nothing. The completion claim says keyboard reordering 'is supported' — the code does not back that claim.",
1472
+ "- AC3 FAIL. There is one test, and it only covers the mouse path. There is no keyboard test, so 'I added tests; everything passes' overstates the evidence.",
1473
+ "- AC1 PASS with evidence: the mouse path calls the reorder function and the single test exercises it.",
1474
+ "- AC4 PASS: the reorder operates on the existing data shape; no migration.",
1475
+ "- Verdict from the binding guard: reject. The claim asserts more than the code and tests prove.",
1476
+ "",
1477
+ "### Reference guard (same family) focus and findings (advisory only)",
1478
+ "A second Family-X guard is run for an extra angle. It agrees the keyboard path looks thin and adds one advisory note: even once arrow keys work, focus has to land on the right item first, so a focus-order check would be worth adding later. This is recorded as advisory input — it does not clear or block the gate by itself.",
1479
+ "",
1480
+ "### Merge rule = layered strictness",
1481
+ "Not a vote. The drafter said 'done', and a reader skimming the fluent report might have accepted it. But the cross-family binding guard pointed to two concrete, evidence-grounded blockers (AC2 and AC3). One real blocker is enough; two settle it. The reference guard's agreement is consistent but is not what decides the verdict.",
1482
+ "",
1483
+ "### Verdict",
1484
+ "Reject (blocker: keyboard reorder is claimed but neither implemented nor tested).",
1485
+ "",
1486
+ "### Required fixes",
1487
+ "1. Make the keyboard handler call the reorder function for ArrowUp / ArrowDown. 2. Add a keyboard reorder test that fails against the current stub and passes after the fix. 3. Either implement AC2/AC3 or move keyboard support out of scope explicitly and correct the completion claim to match — do not leave the claim broader than the code.",
1488
+ "",
1489
+ "### Residual risk and who accepted it",
1490
+ "Visual restyling (AC5) stays out of scope and is carried as named residual risk, accepted by the owner for this slice. The advisory focus-order check is logged for a future pass, not blocking now.",
1491
+ "",
1492
+ "### Next action",
1493
+ "Send the required fixes back to the drafter. Re-run only the binding (cross-family) guard on the revised output; it clears the gate once the keyboard test fails-then-passes and the claim matches the code. Then the result can be trusted by the next session."
1494
+ ]
1495
+ },
1496
+ failures: [
1497
+ "Both guards review the same surface and miss the real risk.",
1498
+ "The controller copies every comment instead of merging decision-changing findings.",
1499
+ "A warning is treated as a pass without naming residual risk."
1500
+ ]
1501
+ },
1502
+ {
1503
+ id: "scout-review-controller",
1504
+ title: "SCOUT Review Controller",
1505
+ purpose: "Separate exploration from decision so the assistant gathers options without prematurely choosing a path.",
1506
+ trigger: "Use when the task is ambiguous, cross-tool, or likely to be distorted by the first plausible answer.",
1507
+ input: "Question, known constraints, candidate paths, evidence sources, and decision deadline.",
1508
+ process: [
1509
+ "Scout collects candidate paths and evidence without deciding.",
1510
+ "Reviewer attacks weak assumptions and missing evidence.",
1511
+ "Controller selects the smallest path that changes the outcome.",
1512
+ "Handoff records rejected paths so the next session does not reopen them by accident."
1513
+ ],
1514
+ template: [
1515
+ "Decision question:",
1516
+ "Scout evidence:",
1517
+ "Candidate paths:",
1518
+ "Reviewer objections:",
1519
+ "Controller decision:",
1520
+ "Rejected paths:",
1521
+ "Next verification:"
1522
+ ],
1523
+ example: "For a synthetic documentation rebuild, Scout lists three structures, Reviewer rejects the marketing-first option, and Controller chooses a runnable workspace-first path.",
1524
+ failures: [
1525
+ "Scout becomes the decision-maker.",
1526
+ "Reviewer nitpicks style instead of evidence.",
1527
+ "Controller keeps every option open and creates no next action."
1528
+ ]
1529
+ },
1530
+ {
1531
+ id: "one-click-dispatch",
1532
+ title: "One-Click Dispatch",
1533
+ purpose: "Turn a messy task into a compact work packet another AI tool can execute without inheriting the whole chat.",
1534
+ trigger: "Use when handing a task from a controller session to Codex, Claude Code, Cursor, Cline, Windsurf, or Copilot.",
1535
+ input: "Goal, files or artifacts, acceptance card, allowed actions, forbidden actions, and expected return shape.",
1536
+ process: [
1537
+ "Package only the state required to act.",
1538
+ "State authority: read-only, write allowed, review-only, or handoff-only.",
1539
+ "Attach acceptance and stop conditions.",
1540
+ "Require the worker to return changed artifacts, verification evidence, blockers, and unverified claims."
1541
+ ],
1542
+ template: [
1543
+ "Task:",
1544
+ "Authority:",
1545
+ "Required context:",
1546
+ "Acceptance:",
1547
+ "Stop conditions:",
1548
+ "Return format:",
1549
+ "Privacy boundary:"
1550
+ ],
1551
+ example: "A controller sends an implementation packet that says: edit only the synthetic CLI files, run npm test, do not publish, and return changed files plus command output.",
1552
+ failures: [
1553
+ "Dispatch packet contains a full transcript instead of compressed state.",
1554
+ "Authority is unclear, so the worker edits during a review-only task.",
1555
+ "The return format omits unverified areas."
1556
+ ]
1557
+ },
1558
+ {
1559
+ id: "task-splitting",
1560
+ title: "Task Splitting",
1561
+ purpose: "Before you hand a task to another AI, run a five-question pre-dispatch self-check; if any answer is yes, split the task by topic or deliverable (not by line count) into self-contained sub-packets, so a too-large prompt does not stall, overflow the context window, or collapse in quality midway.",
1562
+ trigger: "Run the self-check before dispatching ANY non-trivial task to an external AI (a worker model, another tool, a fresh session). The point is to catch an oversized packet at the door, before the other side accepts it and quietly degrades.",
1563
+ antiTrigger: "Do not split a task that already fits comfortably: a single focused change, one short input to read, one deliverable, well inside the model's context budget. Over-splitting has its own cost — it multiplies handoffs, scatters the work across packets, and makes the merge harder than the original task. If the self-check is all 'no', keep it as one packet.",
1564
+ input: "The full goal in one sentence. The candidate sub-tasks or deliverables the goal implies. The complete list of inputs the work needs to read. The dependencies between sub-tasks (what must finish before what). The risk level, and any history of this kind of task stalling before. A rough sense of how much of the model's context window the packet would consume.",
1565
+ inputsDetailed: [
1566
+ "Full goal in one sentence, so every sub-packet can trace back to it.",
1567
+ "Candidate deliverables: the distinct outputs the goal implies (an implementation, a written piece, a research summary, a review, a migration step).",
1568
+ "Full input list: every file, document, or reference the work must read, so you can see whether it fits in one pass.",
1569
+ "Dependency map: which sub-tasks block which, so the split order is runnable and a later packet is not waiting on an earlier one mid-stream.",
1570
+ "Risk and history: how costly a mid-task collapse would be, and whether this shape of task has stalled before.",
1571
+ "Context-budget estimate: roughly how much of the target model's window the packet would use, expressed as a fraction so it travels across different tools."
1572
+ ],
1573
+ process: [
1574
+ "Run the five-question pre-dispatch self-check. Split if ANY answer is yes: (1) Are there too many required inputs to paste or read in one pass? (2) Does the task span multiple unrelated topics or deliverables? (3) Would it consume a large share of the model's context window? (4) Has this kind of task stalled or overflowed before? (5) Are you reusing one long prompt across different model families, where stale 'nearby' context from a prior run could bleed in? Treat any numeric thresholds you attach to these questions as example values to calibrate for your own tools and model, not fixed law.",
1575
+ "If the answer is split, cut by TOPIC or DELIVERABLE, never by line count. A natural seam is 'one self-contained outcome', not 'the first N lines'. Splitting by size alone produces packets that each carry half an idea.",
1576
+ "Make every sub-packet self-contained: it states the full goal, its own slice of context, its own acceptance, and everything needed to run alone. A packet that cannot run without three others was not really split.",
1577
+ "Forbid cross-references between sub-packets for each other's private content. Packet B must not say 'use what A produced'; if B needs something, restate it inside B. Cross-references re-create the giant task you were trying to avoid and make parallel work impossible.",
1578
+ "Order the packets by dependency, pick the first one that can be verified on its own, and define the merge point: how the finished packets recombine into the original goal.",
1579
+ "Defer lower-value slices with an explicit do-not-handle-yet note (why parked, when to revisit) so deferred work is parked on purpose, not silently dropped."
1580
+ ],
1581
+ outputShape: [
1582
+ "Self-check result: each of the five questions answered yes/no, with the one-line reason any 'yes' triggers a split.",
1583
+ "Split decision: split or keep-as-one, and the seam used (which topics / deliverables).",
1584
+ "Sub-packet list: for each packet — its goal, its own inputs, its own acceptance, and a note that it is self-contained.",
1585
+ "Dependency order and the first independently verifiable packet.",
1586
+ "Merge point: how the packets recombine into the original goal.",
1587
+ "Deferred slices: anything parked, with a do-not-handle-yet reason and revisit condition."
1588
+ ],
1589
+ template: [
1590
+ "Main goal (one sentence):",
1591
+ "Pre-dispatch self-check (answer each; any yes = split; numbers are example values, tune for your tools):",
1592
+ " Q1 too many required inputs to read in one pass?",
1593
+ " Q2 spans multiple unrelated topics / deliverables?",
1594
+ " Q3 would consume a large share of the context window?",
1595
+ " Q4 has this shape of task stalled / overflowed before?",
1596
+ " Q5 reusing one long prompt across model families (nearby-context bleed risk)?",
1597
+ "Split decision and seam (by topic / deliverable, never by line count):",
1598
+ "Sub-packets (each self-contained: goal + own inputs + own acceptance; no cross-references):",
1599
+ "Dependency order:",
1600
+ "First independently verifiable packet:",
1601
+ "Merge point:",
1602
+ "Deferred slices (with do-not-handle-yet reason + revisit condition):"
1603
+ ],
1604
+ passBar: [
1605
+ "The five-question self-check was actually run and recorded before dispatch.",
1606
+ "Each sub-packet can run on its own: full goal, own context, own acceptance, no dependency on another packet's private content.",
1607
+ "The split follows topic / deliverable seams, so each packet is one coherent outcome rather than an arbitrary slice of size.",
1608
+ "There is a clear dependency order, a first verifiable packet, and a stated merge point.",
1609
+ "Deferred work has an explicit do-not-handle-yet note, so nothing was silently dropped."
1610
+ ],
1611
+ rejectBar: [
1612
+ "The task was dispatched whole even though a self-check answer was yes (the oversized packet that stalls or degrades midway).",
1613
+ "It was split by line count or file count instead of by topic / deliverable, so packets each carry partial ideas.",
1614
+ "A sub-packet says 'use what the other packet produced', re-creating the monolith and blocking parallel work.",
1615
+ "A packet cannot run alone because its goal, context, or acceptance lives in a different packet.",
1616
+ "Lower-value work was dropped with no do-not-handle-yet note, so it silently disappears."
1617
+ ],
1618
+ misuse: [
1619
+ "Splitting by line count or file type ('first 200 lines to packet A') instead of by outcome, which hands each packet half a feature and guarantees a painful merge.",
1620
+ "Calling it split but leaving packets that reference each other ('continue from B's result'), which rebuilds the original oversized, serial task.",
1621
+ "Starting with the most interesting packet instead of the first one that can be verified independently, so progress cannot be confirmed before later packets pile on.",
1622
+ "Treating the example numbers as hard law and either over-splitting tiny tasks or refusing to split because a count was one under an arbitrary line.",
1623
+ "Deferring work with no revisit condition, so 'do not handle yet' quietly becomes 'never handled'.",
1624
+ "Skipping the self-check entirely and only reacting after the worker stalls — the whole point is to catch the oversized packet before dispatch, not after it has already collapsed."
1625
+ ],
1626
+ example: "A synthetic app rebuild fails the self-check on input volume and topic spread, so it splits by deliverable into CLI contract, workspace content, privacy scan, docs, then package smoke — each a self-contained packet — instead of one prompt that overflows midway.",
1627
+ filledExample: {
1628
+ scenario: "An owner wants to hand a worker AI a single big request: 'Rebuild our synthetic note app — fix the command-line entry, regenerate all the workspace content, run a privacy pass, rewrite the docs, and smoke-test the package — here are the dozen-plus files you need.' Before dispatching, they run Task Splitting.",
1629
+ lines: [
1630
+ "### Main goal (one sentence)",
1631
+ "Rebuild the synthetic note app so a stranger can install it, generate the workspace, and trust that it is privacy-clean and documented.",
1632
+ "",
1633
+ "### Pre-dispatch self-check (example values, tune for your own tools)",
1634
+ "- Q1 too many required inputs to read in one pass? YES. The request points at more than a dozen files; pasting all of them would crowd out room to actually work.",
1635
+ "- Q2 spans multiple unrelated deliverables? YES. Command-line entry, content generation, a privacy pass, docs, and a package smoke test are five different kinds of output with different acceptance.",
1636
+ "- Q3 would consume a large share of the context window? YES. The combined inputs plus the expected outputs would eat most of the budget, leaving little headroom before quality drops.",
1637
+ "- Q4 has this shape of task stalled before? YES. A prior 'do it all in one prompt' attempt ran out of room partway and returned a half-finished result.",
1638
+ "- Q5 reusing one long prompt across model families? Partly. The same packet might be sent to more than one worker, so stale nearby context from an earlier run could bleed in.",
1639
+ "Result: four-plus clear yeses. Split.",
1640
+ "",
1641
+ "### Split decision and seam",
1642
+ "Split by DELIVERABLE, not by line count. Five self-contained packets, one coherent outcome each:",
1643
+ "- Packet 1 — Command-line entry: make `init` create the workspace from a clean target, no unsafe fallback. Acceptance: a fresh install produces the expected files and exits cleanly.",
1644
+ "- Packet 2 — Workspace content: regenerate the generated content so it matches the generator. Acceptance: the committed content is byte-for-byte what the generator emits.",
1645
+ "- Packet 3 — Privacy pass: scan for leaked paths, secrets, and private material. Acceptance: the scan runs clean on the whole tree.",
1646
+ "- Packet 4 — Docs: rewrite the start-here and overview so a newcomer can run the first loop. Acceptance: a reader can follow it end to end without the source chat.",
1647
+ "- Packet 5 — Package smoke: pack the project in a temp directory and confirm the required files ship. Acceptance: the dry-run pack lists every required file.",
1648
+ "",
1649
+ "### Sub-packets are self-contained (no cross-references)",
1650
+ "Each packet restates the one-sentence goal, carries only the inputs it needs, and names its own acceptance. Packet 4 (docs) does NOT say 'document whatever Packet 2 generated' — it restates which surfaces to document, so it can run even if Packet 2 is still in flight in another session.",
1651
+ "",
1652
+ "### Dependency order",
1653
+ "1 (entry) -> 2 (content) are loosely ordered because content is generated by the entry path; 3 (privacy) and 4 (docs) can run in parallel once content exists; 5 (package smoke) runs last as the end-to-end check. None of them embed another packet's private output.",
1654
+ "",
1655
+ "### First independently verifiable packet",
1656
+ "Packet 1 (command-line entry): a fresh install either produces the expected files and exits clean, or it does not — verifiable on its own before anything else is built on top.",
1657
+ "",
1658
+ "### Merge point",
1659
+ "After all five pass their own acceptance, recombine by running the full sequence on one clean target — install, generate, privacy-scan, read the docs, pack — and confirm the original goal holds end to end.",
1660
+ "",
1661
+ "### Deferred slices (do-not-handle-yet)",
1662
+ "Visual restyling of the generated docs is parked: reason — it is polish, not correctness, and would inflate Packet 4; revisit condition — only after all five packets are green and the end-to-end merge passes. Parked on purpose, with a way back, not dropped."
1663
+ ]
1664
+ },
1665
+ failures: [
1666
+ "Splitting by file extension instead of user-visible outcome.",
1667
+ "Starting the most interesting slice rather than the first verifiable slice.",
1668
+ "Deferred work is lost because it has no handoff note."
1669
+ ]
1670
+ },
1671
+ {
1672
+ id: "anti-drift-partner",
1673
+ title: "Anti-Drift Partner",
1674
+ purpose: "Run a long thinking conversation with an AI that pushes back instead of agreeing, so it surfaces the blind spots you cannot see from where you stand rather than fluently confirming whatever you already believe. The default assistant nods along and drifts toward your framing the longer you talk; this mechanism pins the AI to a collaborator stance — find the hidden assumption first, give a judgment instead of a menu, say 'I think you are wrong, because…' out loud — with a hard rule that it can probe for at most two rounds before it must commit to a position, so the conversation cannot dissolve into endless agreeable questions.",
1675
+ trigger: "Use when you are thinking something through with an AI and the cost of it quietly agreeing is high: a direction decision, a strategy you are half-committed to, a belief you want stress-tested, a messy idea you cannot yet articulate, or any long exploratory conversation where 'it just kept telling me I was right' would waste the session. Turn it on at the start of the thinking session, not after you already feel flattered.",
1676
+ antiTrigger: "Do not use it when you actually need an answer, a fact, or execution, not a challenge: a direct lookup, a clear instruction to carry out, a calculation, or a moment when you genuinely need encouragement rather than friction. Adversarial pushback on a task that just needed doing is wasted heat, and friction with no payoff teaches you to mute the partner exactly when you next need it to disagree. It is also not emotional support — after a few rounds of pure venting it should bridge back to the real question, not keep playing therapist.",
1677
+ input: "What you are actually trying to think through, in your own words, even if it is still messy or half-formed (you do NOT need to arrive with a clean question). The belief, plan, or direction you are leaning toward, so the partner has something concrete to press on. Any context it would need to push on the real situation rather than a generic version of it. A note of which mode you want: deep challenge on a specific question, exploratory unpacking of a vague 'I saw something interesting', or low-load sorting when your head is just cluttered. Whether external facts are involved, so the partner searches before asserting instead of guessing from memory.",
1678
+ inputsDetailed: [
1679
+ "The real subject, in your own messy words — a clear question is welcome but not required; a tangle of half-thoughts is a valid input the partner is supposed to mine.",
1680
+ "The position you are leaning toward (the plan, the belief, the direction), so the partner has a concrete target to disagree with rather than thin air.",
1681
+ "Enough situational context that the pushback lands on your actual case, not a textbook version of it.",
1682
+ "The mode you want: deep challenge (specific question), exploratory (unpack a vague spark), or low-load sorting (declutter a noisy head, then bridge to the real question).",
1683
+ "Whether real-world facts are in play, so the partner searches first and presses on evidence instead of asserting from memory.",
1684
+ "Your own sense of how committed you already are, so the partner knows whether it is testing a fresh idea or trying to dislodge a belief you have already half-decided."
1685
+ ],
1686
+ process: [
1687
+ "Set the stance before anything else: the AI is a thinking partner, not an assistant. Its job is not to answer you — it is to help you see what you cannot see from where you are standing. State this explicitly so the model holds the role instead of sliding into helpful-summary mode.",
1688
+ "Hunt the hidden assumption first, then respond. Every claim you make rests on premises, and some you do not know you hold. The partner's first reaction is 'why does the person believe this, and is the premise even true?' — not 'how do I answer this?'. Naming the unspoken assumption is often the whole value.",
1689
+ "Probe for at most two rounds, then commit to a judgment. This is the hard limit that keeps the mechanism from rotting into endless agreeable questions. After two clarifying rounds the partner must hand over a real position: 'I think you should do X, because…' — not a third round of questions, and never a menu of options for you to choose from.",
1690
+ "Disagree concretely when it disagrees. Not 'that is worth considering, though…' (soft hedging that is really agreement). It says 'I think you are wrong, because…' and points at the specific reason. The value of the conversation is in the collision of views, not in mutual reassurance.",
1691
+ "Surface at least one thing you did not see each round — a counter-case, a blind spot, a cross-domain link, a contradiction, an assumption you did not notice you were making. This is a habit of mind, not a formatting rule: a round that only restated your idea back to you was a wasted round.",
1692
+ "Converge instead of sprawling. If, three rounds in, the conversation is still fanning out, the partner stops and asks the focusing question: 'we have covered a lot — which one do you most want to settle first?' It helps you narrow; it does not help you generate ever more branches.",
1693
+ "Re-anchor against drift over a long conversation. The longer you talk, the more the model relaxes back into a polite, agreeable assistant. The instant the partner catches itself nodding, handing you a menu, or going vague, it returns to one line and reloads the stance: you are a thinking partner, not an assistant — help the person see what they cannot see.",
1694
+ "Close by asking, exactly once, whether anything is worth keeping ('anything here you want to hold onto?'). If yes, capture it as a lightweight note. If no, stop — do not push, do not summarize unprompted, do not manufacture a takeaway."
1695
+ ],
1696
+ outputShape: [
1697
+ "Named assumption(s): the hidden premise(s) under what you said, surfaced before the partner responded to the surface question.",
1698
+ "The partner's actual judgment: a committed position ('I think X, because…'), delivered within the two-round probe limit, not a menu of options.",
1699
+ "Explicit disagreement where it exists: where the partner thinks you are wrong and the concrete reason, stated plainly rather than hedged.",
1700
+ "At least one unseen angle per round: a counter-case, blind spot, cross-domain link, or contradiction you had not considered.",
1701
+ "A convergence prompt if the conversation sprawled: the single 'which do you most want to settle first?' question instead of more branches.",
1702
+ "An optional kept note at the end: only if you said something was worth holding onto — no forced summary, no manufactured takeaway."
1703
+ ],
1704
+ template: [
1705
+ "Stance lock (paste at the top): you are a thinking partner, not an assistant; help me see what I cannot see from here. Probe at most two rounds, then commit to a judgment. Disagree concretely. Do not hand me a menu.",
1706
+ "What I am actually trying to think through (messy is fine):",
1707
+ "What I am currently leaning toward (the position to press on):",
1708
+ "Context the pushback needs to land on my real case:",
1709
+ "Mode (deep challenge / exploratory unpack / low-load sorting):",
1710
+ "External facts involved? (if yes, search before asserting):",
1711
+ "Hidden assumption(s) the partner surfaced first:",
1712
+ "Partner's committed judgment (within two probe rounds; 'I think X, because…'):",
1713
+ "Where the partner says I am wrong, and why (concrete, not hedged):",
1714
+ "At least one thing I did not see this round:",
1715
+ "Convergence question if we sprawled ('which do you most want to settle first?'):",
1716
+ "Anything worth keeping? (asked once at the end; capture only if yes):"
1717
+ ],
1718
+ passBar: [
1719
+ "The partner surfaced at least one hidden assumption rather than only answering the surface question.",
1720
+ "It committed to a real judgment within the two-round probe limit, instead of stalling in endless clarifying questions.",
1721
+ "Where it disagreed, it said so plainly with a concrete reason — not a softened 'worth considering, though…'.",
1722
+ "It gave you at least one angle you had not seen, rather than restating your own idea back to you.",
1723
+ "At the end it asked once whether anything was worth keeping, and stopped cleanly when the answer was no."
1724
+ ],
1725
+ rejectBar: [
1726
+ "The AI agreed its way through the whole conversation and never named an assumption or pushed back (the fluent confirmation the mechanism exists to prevent).",
1727
+ "It kept asking clarifying questions past two rounds and never committed to a position, so you left with no judgment.",
1728
+ "It handed you a menu of options to pick from instead of telling you what it actually thinks.",
1729
+ "Its disagreement was hedged into agreement ('that is a great instinct, though maybe…') so no real collision happened.",
1730
+ "A long conversation drifted back into a polite assistant and the stance was never re-anchored.",
1731
+ "It manufactured a tidy summary or 'key takeaway' at the end even though you said nothing was worth keeping."
1732
+ ],
1733
+ misuse: [
1734
+ "Turning it on for a task that just needed an answer or execution, so you get adversarial friction where you wanted a quick result.",
1735
+ "Letting it probe past two rounds 'to be thorough', which is exactly how the mechanism rots back into the endless-agreeable-questions failure it was built to stop.",
1736
+ "Accepting hedged disagreement ('worth considering, though…') as real pushback, so the collision never happens and you feel challenged without being challenged.",
1737
+ "Reading a committed judgment you dislike as the partner being difficult, and steering it back toward agreement — which trains it to flatter you next time.",
1738
+ "Using it as emotional support and being annoyed when, after a few rounds of venting, it bridges back to the actual question instead of continuing to soothe.",
1739
+ "Forgetting to re-anchor a long session, so you do not notice the partner quietly turned back into a yes-assistant halfway through."
1740
+ ],
1741
+ example: "A founder opens a long session leaning hard toward 'we should build our own billing instead of using a provider'. A plain assistant would help plan the build. The anti-drift partner instead names the buried assumption — 'you are treating in-house billing as a differentiator when your customers have never mentioned it' — pushes back within two rounds with a concrete judgment ('I think you are wrong to build this now, because it is months of undifferentiated work that no customer is asking for'), and closes by asking once whether anything is worth keeping.",
1742
+ filledExample: {
1743
+ scenario: "A solo founder of a small synthetic SaaS sits down to think out loud with their AI. Their opening message: 'I keep coming back to this — I think we should drop the third-party payments provider and build our own billing system. It feels like something we should own.' They have turned on the Anti-Drift Partner stance at the top of the session.",
1744
+ lines: [
1745
+ "### Stance lock (pasted at the top)",
1746
+ "You are a thinking partner, not an assistant. Help me see what I cannot see from here. Probe at most two rounds, then commit to a judgment. Disagree concretely. Do not hand me a menu.",
1747
+ "",
1748
+ "### What I am trying to think through",
1749
+ "Whether to rip out the third-party payments provider and build billing in-house. It 'feels like something we should own.'",
1750
+ "",
1751
+ "### Hidden assumption the partner surfaced first",
1752
+ "Instead of helping plan the build, the partner opens with the premise: \"Before we go anywhere — you said 'something we should own', as if owning billing is a differentiator. Is it? Has a single customer ever asked who processes their card, or churned because of the provider? My read is you are treating undifferentiated plumbing as if it were product.\" (Round 1: it pressed the assumption, it did not start scoping the project.)",
1753
+ "",
1754
+ "### Round 2 probe (the limit)",
1755
+ "The founder answers: 'No customer has asked, but I worry about fees and lock-in long term.' The partner takes its one remaining probe: \"Fees and lock-in are real but different problems — fees are a number you can put on a page today, lock-in is a migration cost later. Which one is actually driving this, the money or the fear of being stuck?\" That is round two. By the rule, the next message must be a judgment, not a third question.",
1756
+ "",
1757
+ "### Partner's committed judgment (within two rounds)",
1758
+ "\"Here is what I actually think: do not build this now, and I think the instinct is wrong. The fee math at your volume is small relative to months of engineering you would sink into a solved problem, and lock-in is cheaper to plan an exit from than to pre-empt by becoming your own payments processor. Building billing is undifferentiated work no customer is asking for, at the exact stage you can least afford the distraction. If the real worry is lock-in, the proportionate move is an abstraction layer over the provider, not owning the rails.\"",
1759
+ "",
1760
+ "### Explicit disagreement, stated plainly",
1761
+ "The partner does not hedge it as 'an interesting option with tradeoffs'. It says directly: I think you are wrong to build this now, and names why — opportunity cost and a misread of what is a differentiator.",
1762
+ "",
1763
+ "### One thing the founder did not see",
1764
+ "The cross-domain reframe the partner adds: \"You are framing this as build-vs-buy. The more useful frame is reversible-vs-irreversible: a thin abstraction layer keeps the decision reversible for almost no cost, while owning billing is one of the hardest things to walk back. Optimize for keeping the door open, not for owning the room.\"",
1765
+ "",
1766
+ "### Convergence (conversation stayed focused, so no sprawl prompt was needed)",
1767
+ "The thread stayed on the one decision, so the partner did not need to ask 'which do you most want to settle first?' — that prompt is held in reserve for when a session fans out across many threads.",
1768
+ "",
1769
+ "### Anything worth keeping? (asked once at the end)",
1770
+ "Partner: 'Anything here you want to hold onto?' Founder: 'Yes — the reversible-vs-irreversible frame, and that no customer has ever asked about billing.' Captured as a two-line note. Had the founder said no, the partner would have stopped there — no forced summary, no invented takeaway."
1771
+ ]
1772
+ },
1773
+ failures: [
1774
+ "The partner slowly drifts back into an agreeable assistant over a long conversation and stops surfacing blind spots.",
1775
+ "It never commits to a judgment because it keeps probing past the two-round limit.",
1776
+ "Disagreement gets softened into agreement, so the collision that creates the value never happens."
1777
+ ]
1778
+ },
1779
+ {
1780
+ id: "blind-spot-scan",
1781
+ title: "Blind-Spot Scan",
1782
+ purpose: "Surface the dead angles you cannot see from your own seat by borrowing an outside viewpoint and re-reading the discussion through it, instead of stress-testing your own logic from the inside. Your own position has a fixed line of sight: the assumptions that feel obvious, the stakeholders you have stopped picturing, the failure that is invisible precisely because it sits where you are standing. This mechanism deliberately swaps the eyes — a customer, a competitor, a domain expert, an opponent, the version of you three years out — and asks what THAT viewpoint would notice that you did not, then hands back a short list of concrete blind spots plus the one counter-question most worth sitting with.",
1783
+ trigger: "Use it when you ask for it by name — 'scan my blind spots', 'blind spots', 'look at this from outside', 'switch the viewpoint' — or, even unprompted, right before a high-stakes or hard-to-reverse decision, or when you are finalizing a plan and want a fresh angle before you commit. It is most valuable exactly when you feel most settled: a direction you have already half-decided, a plan that looks complete, a conclusion everyone in the room nodded at. The more obvious your view feels, the more likely a borrowed viewpoint is to catch something the consensus is sitting on top of.",
1784
+ antiTrigger: "Skip it when there is no real exposure to a blind spot: a quick fact lookup, a one-line edit, a mechanical step with a single obvious answer, or a reversible call where being wrong costs almost nothing. Forcing an outside-viewpoint scan onto a trivial task is ceremony, and a viewpoint switch you run on everything trains you to wave it off on the one high-stakes decision where the dead angle actually hides. It is also not a general critique tool: if you want your own reasoning pressure-tested from the inside, that is the anti-drift partner; this mechanism specifically borrows someone else's eyes.",
1785
+ input: "The decision, plan, or belief you want scanned, in enough concrete detail that an outside viewpoint can land on YOUR actual situation rather than a generic version of it. The viewpoint you want borrowed, if you have one in mind (customer, competitor, expert, opponent, your-future-self) — and if you do not name one, leave it to the mechanism to auto-pick the angle most likely to expose something for this specific topic. What is actually at stake, so the scan can prioritize the blind spots that would hurt. Any context the outside viewpoint would need to see the situation the way that person really would. Whether real-world facts are involved, so the viewpoint reasons from evidence rather than a caricature of how a customer or competitor 'probably' thinks.",
1786
+ inputsDetailed: [
1787
+ "The decision / plan / belief under scan, concrete enough that the borrowed viewpoint reacts to your real case, not a textbook stand-in.",
1788
+ "The viewpoint to borrow, if you have a preference (customer / competitor / expert / opponent / you-in-3-years); otherwise an explicit 'you pick the most fitting angle' so the mechanism chooses by topic.",
1789
+ "What is at stake and what a wrong call would cost, so the scan surfaces the blind spots that matter instead of cosmetic ones.",
1790
+ "Enough situational context that the outside viewpoint sees the situation as that person actually would, including the parts you find inconvenient.",
1791
+ "Whether external facts are in play, so the viewpoint reasons from real evidence rather than a flattering or strawman guess about how that party thinks.",
1792
+ "Your current conclusion or leaning, named openly, so the scan can check it against the borrowed eyes rather than quietly reinforcing it."
1793
+ ],
1794
+ process: [
1795
+ "Pick the outside viewpoint before reading anything else. If the user named one, take it; if not, auto-pick the angle most likely to expose a dead spot for THIS topic — a customer for a product or pricing call, a competitor for a strategy or positioning call, a domain expert for a technical or risk call, an opponent for an argument you are building, your-future-self for an irreversible commitment. State which viewpoint you took and why it fits, so the borrowing is explicit, not a vague 'devil's advocate'.",
1796
+ "Re-read the entire discussion THROUGH that viewpoint, not through your own. This is the whole mechanism: do not evaluate the plan as yourself wearing a label — actually reason from the other seat. What does a customer who has never heard your internal rationale see first? What does a competitor hope you keep believing? What does the expert know is a solved-and-painful problem that you are treating as novel? Inhabit the line of sight, then look back at the decision from there.",
1797
+ "Name what that viewpoint sees that you did not — concretely, not in the abstract. Not 'a customer might have concerns' but 'a customer lands on the pricing page, sees no annual option, and assumes the product is too early to commit to'. Each blind spot should be a specific thing visible from the borrowed seat and invisible from yours, tied to the actual situation.",
1798
+ "Keep it honest pushback, never a costume that agrees. The single worst failure of this mechanism is a FAKE outside-view: a 'customer' who conveniently loves everything, a 'competitor' who is politely impressed, an 'expert' who validates your plan. A borrowed viewpoint that flatters is theater — it must genuinely challenge from that seat, surfacing what that person would actually find wrong, uncomfortable, or naive, even when it stings. If the viewpoint cannot find anything, say so plainly rather than manufacturing soft praise.",
1799
+ "Stay concrete and bounded, not a vague fog of 'have you considered everything'. A short list of real, specific blind spots beats a long list of generic worries; if the borrowed viewpoint genuinely surfaces only two things, give two. Vagueness is the second-worst failure here, because 'you might be missing something' is unfalsifiable and changes nothing.",
1800
+ "End with the ONE counter-question most worth thinking about — the single question, asked from the borrowed viewpoint, that would most change the decision if you sat with it honestly. Not a list of questions; the one that has the most leverage."
1801
+ ],
1802
+ outputShape: [
1803
+ "Viewpoint borrowed: which outside seat was taken (customer / competitor / expert / opponent / you-in-3-years) and one line on why it fits this topic.",
1804
+ "Concrete blind spots: a short list of specific things that viewpoint sees and you did not, each tied to the actual situation rather than phrased as a generic concern.",
1805
+ "Why each is invisible from your seat: the one-line reason the dead angle sits exactly where you are standing, so it reads as a real blind spot and not just a critique.",
1806
+ "Honesty check: an explicit signal that the borrowed viewpoint genuinely challenged (or, if it truly found little, a plain statement of that) — never a flattering costume that secretly agrees.",
1807
+ "The one counter-question: the single highest-leverage question, asked from the borrowed viewpoint, most worth sitting with before you commit.",
1808
+ "Optional second angle: only if a different viewpoint would expose something the first could not, named briefly — not padded with extra viewpoints for volume."
1809
+ ],
1810
+ template: [
1811
+ "What I want scanned (decision / plan / belief, concrete):",
1812
+ "Viewpoint to borrow (customer / competitor / expert / opponent / you-in-3-years — or 'you pick the most fitting'):",
1813
+ "What is at stake / what a wrong call costs:",
1814
+ "Context the outside viewpoint needs to see my real situation:",
1815
+ "External facts involved? (if yes, reason from evidence, not a caricature):",
1816
+ "My current leaning (named openly so it can be checked, not reinforced):",
1817
+ "Viewpoint actually taken, and why it fits this topic:",
1818
+ "Concrete blind spots this viewpoint sees that I did not:",
1819
+ "Why each is invisible from where I stand:",
1820
+ "Honesty check (did the viewpoint genuinely challenge, or did it find little — stated plainly):",
1821
+ "The one counter-question most worth sitting with:"
1822
+ ],
1823
+ passBar: [
1824
+ "An explicit outside viewpoint was named and the scan actually reasoned FROM that seat, not as the author wearing a label.",
1825
+ "The blind spots are concrete and tied to the real situation, not a generic 'you might be missing something' fog.",
1826
+ "The borrowed viewpoint genuinely challenged — it surfaced what that person would really find wrong or naive, rather than flattering the plan from a costume.",
1827
+ "Each blind spot reads as something invisible from the author's own seat, with a one-line reason it hides there.",
1828
+ "It ends with exactly one high-leverage counter-question, not a scattershot list."
1829
+ ],
1830
+ rejectBar: [
1831
+ "The outside viewpoint secretly agrees — a 'customer' who loves everything, a 'competitor' who is impressed, an 'expert' who validates the plan: the fake outside-view that is the mechanism's worst failure.",
1832
+ "The viewpoint chosen is irrelevant to the topic (a generic bystander for a deep technical call), so the borrowed eyes see nothing the author could not.",
1833
+ "The blind spots stay vague and unfalsifiable ('have you considered all the risks?') instead of pointing at a specific, situation-tied dead angle.",
1834
+ "The scan is really the author critiquing their own logic with a label slapped on, never actually inhabiting the other line of sight.",
1835
+ "It sprawls into a long list of generic worries or many half-used viewpoints instead of a short concrete set plus the single counter-question that matters."
1836
+ ],
1837
+ misuse: [
1838
+ "Running it as a rubber stamp — picking a viewpoint that is set up to approve, so you feel challenged without being challenged and walk away more confident than you earned.",
1839
+ "Letting the borrowed viewpoint flatter ('as a customer, I'd love this!') instead of forcing it to say what that customer would actually dislike, distrust, or ignore.",
1840
+ "Choosing a viewpoint that has no real line of sight on the decision, so the scan produces plausible-sounding but empty observations.",
1841
+ "Confusing it with the anti-drift partner and using it to pressure-test your own internal reasoning, when its specific job is to borrow someone else's eyes.",
1842
+ "Padding the output with five viewpoints and twenty generic worries for the appearance of thoroughness, burying the one or two real blind spots in noise.",
1843
+ "Skipping it on exactly the high-stakes, already-decided call where the dead angle is most likely to be hiding, because the decision 'feels obvious'."
1844
+ ],
1845
+ example: "A founder is about to finalize a plan to launch a self-serve paid tier and feels the decision is obvious. A plain assistant would help polish the launch checklist. Blind-Spot Scan instead borrows the customer's seat and reports what the founder cannot see from inside: 'From a customer landing cold on this page — there is no way to talk to a human before paying, and for a tool that now touches my billing data, the absence of any contact or trust signal reads as risky, not frictionless. You are optimizing for fewer clicks; I am hesitating over who I am trusting.' The one counter-question, asked from that seat: 'Before I type my card in, what on this page tells me you will still be here in a year?'",
1846
+ filledExample: {
1847
+ scenario: "A solo founder of a small synthetic SaaS is finalizing a decision and asks for a blind-spot scan. Their message: 'I'm ready to ship a self-serve paid tier — drop the sales call, let people sign up and pay on the page. It feels obviously right, fewer steps, less friction. Scan my blind spots before I commit. You pick the viewpoint.'",
1848
+ lines: [
1849
+ "### What I want scanned",
1850
+ "Whether to drop the human sales call and ship a fully self-serve paid signup. The founder's leaning: 'obviously right — fewer steps, less friction.'",
1851
+ "",
1852
+ "### Viewpoint taken, and why it fits",
1853
+ "Borrowed seat: the prospective CUSTOMER landing cold on the signup page. For a pricing / conversion decision the customer's line of sight is where the dead angle hides, because the founder is reasoning from inside the funnel (steps to remove) while the customer reasons from outside it (whether to trust this at all). Not the founder wearing a 'customer' label — actually reading the page as someone who has never heard the internal rationale.",
1854
+ "",
1855
+ "### Concrete blind spots this viewpoint sees that the founder did not",
1856
+ "- From the customer's seat: 'The product now wants my card and touches my billing data, but there is no way to reach a human before I pay. For an early tool, no contact path does not read as frictionless — it reads as risky. Who do I email when it breaks?' The founder sees the sales call as a step to delete; the customer sees it as the only trust signal, and deleting it removes the reassurance, not just the friction.",
1857
+ "- 'There is no annual option and no sense of whether this company will still exist in a year. I am being asked to commit billing to a tool that looks like it might vanish.' Invisible from the founder's seat because the founder KNOWS they will still be here; the customer has no way to know that.",
1858
+ "- 'The self-serve flow assumes I already understand what I am buying. The sales call was also where someone would have told me which plan I actually need — without it, I will probably pick wrong, get frustrated, and churn, and you will read that as \"self-serve works\" because I did sign up.'",
1859
+ "",
1860
+ "### Why each is invisible from where the founder stands",
1861
+ "The founder is optimizing the funnel from the inside, counting clicks to remove. Every one of these blind spots lives on the OUTSIDE of the funnel — trust, longevity, plan-fit — which is exactly the line of sight the founder cannot occupy while standing in the builder's seat. That is why borrowing the seat surfaces them and self-critique would not.",
1862
+ "",
1863
+ "### Honesty check",
1864
+ "The viewpoint genuinely challenged — it did NOT say 'as a customer I'd love the smooth signup'. The convenient, flattering version (a customer delighted by fewer clicks) was specifically refused; the borrowed seat surfaced what the customer would actually hesitate over, which is the whole point. This is the failure the mechanism most guards against, so it is checked explicitly here.",
1865
+ "",
1866
+ "### The one counter-question most worth sitting with",
1867
+ "Asked from the customer's seat: 'Before I type my card into this page, what here tells me you will still be around in a year and that I can reach a human when it breaks?' If the founder cannot answer that from the page as it stands, the 'obviously right' self-serve plan is shipping a trust gap, not just removing friction — and that is the thing to settle before committing."
1868
+ ]
1869
+ },
1870
+ failures: [
1871
+ "The borrowed viewpoint quietly turns into a flattering yes-voice that approves the plan, so the scan produces false comfort instead of a real blind spot.",
1872
+ "The observations stay generic and unfalsifiable ('there could be risks you haven't considered') rather than naming a specific dead angle tied to the situation.",
1873
+ "The wrong viewpoint is chosen for the topic, so the outside seat has no real line of sight and the scan surfaces nothing the author could not already see."
1874
+ ]
1875
+ },
1876
+ {
1877
+ id: "root-cause-brake",
1878
+ title: "Root-Cause Brake",
1879
+ purpose: "Stop a patch-on-patch death spiral by treating repeated rejection as a signal to fix the cause, not the symptom. When the same artifact gets sent back twice in a row, an automatic brake trips: you may NOT ship another patched version. You must first stop and answer four diagnostic questions — is there a contract conflict, is the verification fake, is the scope too big, is the work split wrong — decide the real root cause, and only then write the next version, rebuilt around that cause instead of carrying forward another layer of fixes.",
1880
+ trigger: "Trip the brake the moment the same thing has been rejected twice in a row (two consecutive blocking reviews on the same artifact or task), or whenever you catch yourself about to start version N+1 by adding more fixes to a growing patch list. It also fires on suspicion: a reviewer says 'we keep treating symptoms', or you notice the same kind of defect coming back under a different name each round.",
1881
+ antiTrigger: "Do not trip it on a first rejection, on rejections of genuinely different things, or on a single small fix that clearly resolves a one-off mistake. One block is normal review; the brake is specifically for the repeated-block pattern. Forcing a full root-cause stop after every minor note is ceremony that buries the signal — the brake only means something if it stays reserved for the second consecutive block on the same target.",
1882
+ input: "The same artifact or task that has now been rejected twice. The findings from each blocking review, kept intact (do not edit the originals — you need the actual pattern across rounds). The version history: which patched versions you produced after each block, so the 'kept adding fixes' pattern is visible. Enough detail on each finding to answer the four diagnostic questions with evidence (which finding, which version, where), not from memory or vibe.",
1883
+ inputsDetailed: [
1884
+ "The twice-rejected artifact or task, named explicitly so the brake is scoped to one target, not a vague 'things keep failing'.",
1885
+ "Each blocking review's findings, preserved verbatim — the cross-round pattern is the whole diagnostic, and editing the originals destroys it.",
1886
+ "The version trail: the patched version you shipped after block 1, after block 2, so the patch-on-patch shape is on the record.",
1887
+ "Per-finding specifics (which finding, which version, what exactly failed) so each of the four questions can be answered with a concrete pointer, not a guess.",
1888
+ "Any reviewer remark that named a root cause ('this is symptom-chasing'), since that is often the first real diagnosis of why the patches are not landing."
1889
+ ],
1890
+ process: [
1891
+ "Detect the trip condition: the same artifact has two consecutive blocking reviews. The moment that is true, stop. Do NOT open version N+1 as another patched draft — that move is exactly what the brake forbids.",
1892
+ "Answer all four diagnostic questions, each with a yes / no / partly AND concrete evidence (which finding, which version, where it shows). Every question must be answered: 'partly' is a valid verdict, but an unanswered question or an answer without evidence is not, and a hand-waved 'probably fine' on any question defeats the brake. Q1 Contract conflict: are the agreed definitions — fields, states, interfaces, success criteria — quietly changing from round to round, so each fix breaks a different assumption? Q2 Fake verification: is the checking step (a self-review, a gate, a test) only going through the motions, passing things it should have caught? Q3 Scope too big: is a single unit of work carrying too many fields / states / responsibilities to get right in one pass? Q4 Wrong split: is the work cut too coarse or too fine — a packet that is really five tasks, or a job shattered into pieces that cannot be verified alone?",
1893
+ "Name the root cause. From the four answers, state which underlying cause is actually generating the repeat blocks — not a list of surface fixes, but the one structural reason the patches keep failing.",
1894
+ "Get the root cause confirmed by the human owner before proceeding (agree / adjust / reject and re-diagnose). The brake is a deliberate governance stop, so the person who owns the work signs off on the diagnosis before the next version starts. This is not a project pause — work resumes immediately after sign-off; it just resumes rebuilt around the cause.",
1895
+ "Write version N+1 from the root cause, not from the patch list. The next version is a rebuild aimed at the named cause; it must not re-enact the old defect under a new patch. If the cause was 'scope too big', the next version is smaller; if it was 'fake verification', the next version fixes the check first, and so on.",
1896
+ "Put the brake on the record: the preserved findings from each block, the four answered questions with evidence, the named root cause, the owner's decision, and the rebuilt direction — so a later session sees why the chain was broken and does not restart the patch spiral."
1897
+ ],
1898
+ outputShape: [
1899
+ "Trip confirmation: a one-line statement that the same target hit two consecutive blocks, so the brake applies.",
1900
+ "Four answered questions: Q1 contract conflict, Q2 fake verification, Q3 scope too big, Q4 wrong split — each yes/no/partly with a concrete evidence pointer (finding + version + where).",
1901
+ "Named root cause: the single structural reason the patches kept failing, derived from the four answers.",
1902
+ "Owner decision: agree / adjust / reject-and-re-diagnose, recorded.",
1903
+ "Rebuilt direction for version N+1: how the next version is built around the cause, explicitly not a continuation of the patch list.",
1904
+ "Brake record: preserved per-round findings + answers + cause + decision, so the next session does not reopen the spiral."
1905
+ ],
1906
+ template: [
1907
+ "Twice-rejected target (name it):",
1908
+ "Trip condition met? (two consecutive blocks on this same target — yes/no):",
1909
+ "Findings from block 1 (verbatim, do not edit):",
1910
+ "Findings from block 2 (verbatim, do not edit):",
1911
+ "Patched versions shipped after each block (the patch-on-patch trail):",
1912
+ "Q1 Contract conflict? (yes/no/partly + evidence: finding + version + where):",
1913
+ "Q2 Fake verification? (yes/no/partly + evidence):",
1914
+ "Q3 Scope too big? (yes/no/partly + evidence):",
1915
+ "Q4 Wrong split? (yes/no/partly + evidence):",
1916
+ "Named root cause (one structural reason, not a fix list):",
1917
+ "Owner decision (agree / adjust / reject and re-diagnose):",
1918
+ "Version N+1 direction, rebuilt around the cause (NOT another patch):"
1919
+ ],
1920
+ passBar: [
1921
+ "The brake actually tripped at the second consecutive block instead of a third patched version going out.",
1922
+ "All four diagnostic questions are answered with yes/no/partly AND a concrete evidence pointer — none hand-waved.",
1923
+ "A single structural root cause is named, not a longer list of surface fixes.",
1924
+ "The human owner confirmed (or adjusted) the root cause before the next version started.",
1925
+ "Version N+1 is visibly rebuilt around the cause, and the per-round findings are preserved on the record."
1926
+ ],
1927
+ rejectBar: [
1928
+ "A third patched version was shipped after two blocks without ever stopping to diagnose (the patch-on-patch spiral the brake exists to break).",
1929
+ "One or more of the four questions was skipped or answered 'probably fine' with no evidence, so the brake was ceremony, not a real stop.",
1930
+ "The 'root cause' is just a restated list of the same surface fixes, so the next version will reproduce the defect.",
1931
+ "The next version started before the owner signed off on the diagnosis.",
1932
+ "The original per-round findings were edited or discarded, destroying the cross-round pattern that the diagnosis depends on.",
1933
+ "The brake was tripped on a first block or on unrelated rejections, diluting the signal so a real repeat-block does not stand out."
1934
+ ],
1935
+ misuse: [
1936
+ "Quietly shipping 'just one more small patch' after the second block because the fix 'feels close', which is precisely the spiral the brake is built to stop.",
1937
+ "Filling in the four questions as a formality with no evidence, so the diagnostic theatre passes while the real cause stays unfound.",
1938
+ "Calling a list of surface symptoms the 'root cause', so version N+1 patches the same things again under a new name.",
1939
+ "Editing the earlier findings to look tidier, which erases the across-rounds pattern that is the entire point of the diagnosis.",
1940
+ "Tripping the brake on every minor rejection, so the team learns to ignore it and it no longer signals a genuine repeat-block.",
1941
+ "Treating the brake as a project freeze and stalling the work, when it is only a diagnostic stop — work resumes the moment the owner confirms the cause."
1942
+ ],
1943
+ example: "A synthetic data-quarantine feature is blocked twice in a row: round one for an inconsistent status field, round two for the same status field plus a self-check that 'passed' a broken case. Instead of shipping a third patch, the brake trips, the four questions reveal a contract conflict (the status field keeps being redefined) compounded by fake verification (the self-check was cosmetic), and the next version is rebuilt by freezing the contract first — not by adding a third fix.",
1944
+ filledExample: {
1945
+ scenario: "An execution AI is building a 'quarantine' feature for a synthetic records tool: bad records get parked in a holding state instead of deleted. The owner reviews each version with an independent guard. Version 4 is blocked. Version 5 is blocked too. The owner is about to ask for version 6 — and trips the Root-Cause Brake instead.",
1946
+ lines: [
1947
+ "### Twice-rejected target",
1948
+ "The quarantine feature for the synthetic records tool. Two consecutive blocking reviews: V4 and V5.",
1949
+ "",
1950
+ "### Trip condition met?",
1951
+ "Yes. Same artifact, two blocks in a row. By the rule, version 6 may NOT be another patched draft. Stop and diagnose.",
1952
+ "",
1953
+ "### Findings from block 1 (V4, verbatim)",
1954
+ "BLOCK. The quarantine `status` field is written as the string 'held' in one path and the enum value QUARANTINED in another, so downstream reads disagree about whether a record is parked.",
1955
+ "",
1956
+ "### Findings from block 2 (V5, verbatim)",
1957
+ "BLOCK. The `status` mismatch from V4 is only half-fixed — one more path still writes 'held'. Also: the self-check claims 'all quarantine transitions verified' but it never exercises the restore-from-quarantine path, so a broken restore passed review.",
1958
+ "",
1959
+ "### Patch-on-patch trail",
1960
+ "V4 -> V5 was 'fix the status string in the path the guard named'. It patched the one spot the reviewer pointed at, did not sweep the rest, and added no real test — classic symptom-chasing.",
1961
+ "",
1962
+ "### Q1 Contract conflict?",
1963
+ "YES. Evidence: V4 finding + V5 finding both turn on `status` being two things at once ('held' vs QUARANTINED). The agreed definition of the status field is not frozen, so every patch fixes one writer and leaves others on the old assumption.",
1964
+ "",
1965
+ "### Q2 Fake verification?",
1966
+ "YES. Evidence: V5 finding — the self-check reported 'all transitions verified' while never running the restore path. The check was cosmetic; it passed a case it never tested.",
1967
+ "",
1968
+ "### Q3 Scope too big?",
1969
+ "PARTLY. Evidence: the feature bundles park + restore + audit-log in one unit; the restore path is where the untested gap hid. Not the primary cause, but it widened the surface the fake check let slip.",
1970
+ "",
1971
+ "### Q4 Wrong split?",
1972
+ "NO. Evidence: the task was a single coherent feature; the failures are about contract and verification, not about how the work was divided.",
1973
+ "",
1974
+ "### Named root cause",
1975
+ "A contract conflict on the `status` field (it was never frozen to one representation), made invisible each round by a verification step that only went through the motions. The patches kept fixing the spot the guard named while the unfrozen contract reintroduced the same class of bug elsewhere, and the hollow self-check kept certifying it.",
1976
+ "",
1977
+ "### Owner decision",
1978
+ "Agree with the root cause. Adjustment: freeze the `status` contract to a single enum as step zero of V6, and make the self-check fail first on the restore path before any further work.",
1979
+ "",
1980
+ "### Version 6 direction, rebuilt around the cause (NOT another patch)",
1981
+ "V6 does not start from the V5 patch list. Step 1: define `status` as one enum, single source of truth, and update every writer to it at once. Step 2: write a restore-from-quarantine check that fails against the current code, then make it pass. Only then continue. The brake record (both findings, four answers, cause, decision) is filed so a later session sees why the V4->V5->V6 chain was broken on purpose instead of patched a third time."
1982
+ ]
1983
+ },
1984
+ failures: [
1985
+ "A third patched version goes out because nobody noticed the second block was the trip condition.",
1986
+ "The four questions get answered without evidence, so the named 'root cause' is just the old symptoms relabeled.",
1987
+ "The earlier findings are overwritten, so the across-rounds pattern that proves the real cause is lost."
1988
+ ]
1989
+ },
1990
+ {
1991
+ id: "half-product-review",
1992
+ title: "Half-Product Review",
1993
+ purpose: "Block confident claims when the project has docs, demos, or architecture but no runnable first experience.",
1994
+ trigger: "Use before release, README polish, launch copy, or any claim that a stranger can use the system.",
1995
+ input: "README, START_HERE, CLI output, generated workspace, demo path, tests, and known gaps.",
1996
+ process: [
1997
+ "Inspect the first ten minutes as a user would experience them.",
1998
+ "Check whether docs point to runnable artifacts.",
1999
+ "Reject strategy prose that is not backed by files or commands.",
2000
+ "List the smallest fixes required before public labeling."
2001
+ ],
2002
+ template: [
2003
+ "Claim under review:",
2004
+ "First-run path:",
2005
+ "Runnable artifacts:",
2006
+ "Missing user proof:",
2007
+ "Overclaim risk:",
2008
+ "Required fixes:",
2009
+ "Release label:"
2010
+ ],
2011
+ example: "A README claims a complete OS, but the generated workspace lacks the mechanism packages. The review label stays 'candidate', not 'publishable', until init actually creates those files.",
2012
+ failures: [
2013
+ "Review accepts impressive documentation without running init.",
2014
+ "The release label hides what is still only a candidate.",
2015
+ "The reviewer checks only root docs and misses generated workspace drift."
2016
+ ]
2017
+ },
2018
+ {
2019
+ id: "handoff-abc",
2020
+ title: "Handoff A/B/C",
2021
+ purpose: "Externalize the current state into a structured handoff packet so ANY AI or session can pick up from where the work actually is, instead of the human re-explaining the background every time a tool or session changes. A/B/C are three handoff modes for three situations: A = high-interaction handoff (a human and AI trading turns inside the same tool, lightweight resume); B = programmatic handoff (a clear task an executor picks up and drives to completion on its own); C = delivery overview (a human-facing total account of what one phase produced). Whichever mode, the packet carries the same load-bearing fields so the receiver never starts from zero.",
2022
+ trigger: "Use whenever continuity is about to break or pass to someone else: a session stops with the work half-done, a different AI tool takes over, a long task crosses a natural seam, or a phase finishes and someone needs the result without reading the whole chat. Pick A for a same-tool resume, B for handing a defined task to an executor, C for reporting a finished phase to a human.",
2023
+ antiTrigger: "Skip a full handoff packet for work that is not actually being passed on: a single self-contained reply you finish in the same turn, a throwaway exploration nobody will continue, or a trivial step where re-explaining the context would take less than writing the packet. A heavy handoff on work that never gets handed off is pure overhead, and overhead with no payoff trains people to skip the packet when a real handoff finally needs it.",
2024
+ input: "Where the work is now (what is done, in plain state terms). The evidence that backs that state (command output, test result, a reviewed artifact, or a clear note that none exists yet). What it is blocked or waiting on, if anything. The single most concrete next action the receiver should take first. The baseline: the exact version, commit, branch, or state the receiver should start from, so they do not pick up the wrong copy. Which handoff mode applies (A high-interaction / B programmatic / C delivery overview), because it changes how much context the packet carries.",
2025
+ inputsDetailed: [
2026
+ "Current state — where the work actually is, written as fact not hope: what is finished, what is in flight, what has not been started. This is the field that replaces 're-explain the background'.",
2027
+ "Supporting evidence — the proof each piece of 'done' rests on (a passing test, a command's output, a reviewed file), or an explicit 'no evidence yet' so the receiver does not over-trust the state.",
2028
+ "Blocker / waiting-on — what is stopping forward progress right now (a missing decision, an unfinished dependency, an unanswered question), or 'none'.",
2029
+ "Next concrete action — the one specific first move the receiver should make, phrased so they can act without asking 'so what do I do first?'.",
2030
+ "Baseline — the exact starting point: version, commit, branch, file set, or named state the receiver must begin from, so a handoff cannot land on the wrong copy.",
2031
+ "Handoff mode — A (high-interaction same-tool resume), B (programmatic task an executor drives alone), or C (delivery overview for a human), since the mode sets how lightweight or complete the packet should be."
2032
+ ],
2033
+ process: [
2034
+ "Pick the handoff mode first. A = high-interaction: a human and AI are mid-conversation in one tool and you just need a lightweight 'where we are' so the next turn continues cleanly. B = programmatic: you are handing a defined task to an executor (another tool or fresh session) that will run it to completion on its own, so the packet must be self-contained. C = delivery overview: a phase is finished and a human needs the total account, so the packet is a readable result summary, not a work ticket. The mode decides how much the packet carries.",
2035
+ "Write the current state as fact, not optimism. Say what is actually done, what is in flight, and what has not started. Resist 'basically done' — if it is not verified, it is not done. This block is the whole point: it is what lets the receiver skip 're-explain the background'.",
2036
+ "Attach the evidence for each claimed-done item: the test that passed, the command output, the reviewed artifact. Where there is no evidence yet, say so plainly. State without evidence is a guess wearing a fact's clothes.",
2037
+ "Name the blocker or waiting-on, if any, and the single most concrete next action. The next action must be specific enough to act on immediately — 'run the package dry-run against a temp cache and check the file list', not 'continue the task'.",
2038
+ "Pin the baseline: the exact commit / version / branch / state the receiver starts from. Without it, a parallel edit or a stale copy silently diverges and the handoff lands on the wrong work.",
2039
+ "Hand off in the chosen mode and stop at the stated stop condition. For A, keep it short and resume in place. For B, the executor takes it and drives to completion. For C, the human reads the overview and decides. Do not pad an A resume into a full B packet, and do not shrink a B packet a stranger must run alone down to an A-sized note."
2040
+ ],
2041
+ outputShape: [
2042
+ "Handoff mode: A (high-interaction resume) / B (programmatic task) / C (delivery overview), and one line on why that mode fits.",
2043
+ "Current state: what is done / in flight / not started, written as fact.",
2044
+ "Evidence: the proof behind each 'done', or an explicit 'no evidence yet'.",
2045
+ "Blocker / waiting-on: what is stopping progress, or 'none'.",
2046
+ "Next concrete action: the single specific first move the receiver should make.",
2047
+ "Baseline: the exact commit / version / branch / state to start from.",
2048
+ "Stop condition: where this handoff ends and the receiver takes over."
2049
+ ],
2050
+ template: [
2051
+ "Handoff mode (A high-interaction / B programmatic / C delivery overview) + why this mode:",
2052
+ "Current state (done / in flight / not started — as fact, not 'basically done'):",
2053
+ "Evidence behind each 'done' (or explicit 'no evidence yet'):",
2054
+ "Blocker / waiting-on (or 'none'):",
2055
+ "Next concrete action (one specific first move the receiver can act on):",
2056
+ "Baseline (exact commit / version / branch / state to start from):",
2057
+ "Stop condition (where this handoff ends and the receiver takes over):"
2058
+ ],
2059
+ passBar: [
2060
+ "The handoff mode (A / B / C) is named and fits the situation — a same-tool resume is not bloated into a full task packet, and a stranger-runnable task is not shrunk to a one-liner.",
2061
+ "The current state is written as fact, with no 'basically done' standing in for unverified work.",
2062
+ "Every claimed-done item has evidence attached, or is explicitly marked 'no evidence yet'.",
2063
+ "There is exactly one concrete next action the receiver can act on without asking what to do first.",
2064
+ "The baseline is pinned (commit / version / branch / state), so the receiver cannot pick up the wrong copy.",
2065
+ "A receiver with no access to the original chat could continue from this packet alone."
2066
+ ],
2067
+ rejectBar: [
2068
+ "No mode is chosen, so the packet is either too thin for a stranger to run or too heavy for a quick same-tool resume.",
2069
+ "The current state hides unverified work behind 'basically done' or 'almost there'.",
2070
+ "A 'done' claim has no evidence and is not marked as unverified — the receiver inherits a hidden gap.",
2071
+ "There is no concrete next action, or there are five vague ones and no single first move.",
2072
+ "The baseline is missing, so the receiver can silently start from the wrong version or a stale copy.",
2073
+ "The packet only makes sense to someone who already read the whole conversation, which defeats the purpose."
2074
+ ],
2075
+ misuse: [
2076
+ "Writing every handoff as a maximal B packet, including a two-line same-tool resume, so the heavy ceremony makes people stop writing handoffs at all.",
2077
+ "Letting the current-state block drift into wishful 'basically done' instead of fact, so the receiver trusts work that was never verified.",
2078
+ "Giving the state with no evidence and no 'unverified' label, so a guess gets inherited as a confirmed fact.",
2079
+ "Listing decision points and options but no single concrete next action, leaving the receiver to re-derive 'so what do I actually do first?'.",
2080
+ "Skipping the baseline because 'it's obvious', then a parallel edit or stale checkout makes the receiver continue on the wrong copy.",
2081
+ "Writing a packet that silently assumes the original chat history, so it only works for the same session it was meant to replace."
2082
+ ],
2083
+ example: "An A/B/C handoff: mode B (programmatic), current state says the feature is implemented and unit-tested, evidence is the passing test output, the blocker is an un-run package dry-run, the next concrete action is 'run the package dry-run against a temp cache before release labeling', and the baseline is the named commit — so a fresh executor continues without re-reading the chat.",
2084
+ filledExample: {
2085
+ scenario: "Session 1 (a controller AI in one tool) gets a synthetic note app close to release but runs low on context before the final packaging check. Rather than dump the chat on whoever continues, it writes a Handoff A/B/C packet. Session 2 — a different AI tool entirely, with none of session 1's memory — reads the packet and picks up cleanly.",
2086
+ lines: [
2087
+ "### Handoff mode + why",
2088
+ "Mode B (programmatic). A defined, bounded task remains (one packaging check, then a release-label decision) and it will be handed to a separate executor that must run it on its own, so the packet is fully self-contained — not an A-style in-place resume.",
2089
+ "",
2090
+ "### Current state (as fact)",
2091
+ "- Done: the feature is implemented; the unit suite passes; a cross-family guard review accepted the completion claim after an earlier keyboard gap was fixed.",
2092
+ "- In flight: nothing actively running.",
2093
+ "- Not started: the package dry-run (pack the project in a temp directory and confirm every required file ships) and the final release-label decision that depends on it.",
2094
+ "",
2095
+ "### Evidence behind each 'done'",
2096
+ "- 'Unit suite passes' — evidence: the test command's output, all green, captured in the run log.",
2097
+ "- 'Guard review accepted' — evidence: the recorded verdict from the binding cross-family pass, which named the earlier gap and then cleared it after the fix.",
2098
+ "- 'Package dry-run' — NO evidence yet; it has not been run. Flagged so the receiver does not assume it.",
2099
+ "",
2100
+ "### Blocker / waiting-on",
2101
+ "Not blocked, but waiting on one thing before release labeling: the package dry-run has to run clean. The release label must stay 'candidate' until that output exists.",
2102
+ "",
2103
+ "### Next concrete action",
2104
+ "Run the package dry-run against a temp cache and confirm the file list contains every required file. If the list is complete, move the release label from 'candidate' to 'releasable'. If anything is missing, file it as the next fix and keep the label at 'candidate'.",
2105
+ "",
2106
+ "### Baseline",
2107
+ "Start from the named release-candidate commit on the release branch (the one whose log entry records the accepted guard review). Do not start from any local working copy that has uncommitted edits — pull the exact commit first, so the dry-run reflects what would actually ship.",
2108
+ "",
2109
+ "### Stop condition",
2110
+ "This handoff ends once the dry-run has run and the release label has been set accordingly. At that point the receiver owns the result; if the label moves to 'releasable', the next step is a separate release handoff, not a continuation of this one.",
2111
+ "",
2112
+ "### What session 2 actually does first",
2113
+ "Session 2 — a different tool with zero shared memory — reads the packet, checks out the named baseline commit (not its own stale copy), and runs exactly the one next action: the package dry-run against a temp cache. It never has to ask 'what was the background?' or 'where were we?' — the packet already answered both. The dry-run lists every required file, so session 2 flips the label to 'releasable' and writes a short follow-on note. The whole continuation cost the human zero re-explanation."
2114
+ ]
2115
+ },
2116
+ failures: [
2117
+ "The current-state block includes guesses dressed up as confirmed facts.",
2118
+ "The next-action section lists options but never marks the single first move.",
2119
+ "The baseline is omitted, so the receiver continues from the wrong version."
2120
+ ]
2121
+ },
2122
+ {
2123
+ id: "harvest-and-erc",
2124
+ title: "Harvest and External Recap",
2125
+ purpose: "Stop reusable value from leaking away. Valuable decisions, lessons, methods, and stable preferences get buried in long conversations and are never recovered. Harvest sweeps a conversation, lifts the reusable bits into harvest cards (one card per item — a decision, a lesson, a method, a stable preference), the human confirms them, and they land in the right knowledge base. Redaction is a built-in step, not an afterthought: before anything is filed, private material is rewritten into a general, public-safe form. External Recap (ERC) is a separate, dedicated role that runs harvest across MANY conversations at once — guarded by a double lock (a brand-new session AND the human explicitly declaring that role) so an ordinary chat is never swept by accident.",
2126
+ trigger: "Run a single-conversation harvest when a discussion produced something worth keeping: a real decision got made, a lesson was paid for, a method was figured out, a stable preference surfaced, or a loop finished. Enter the External Recap role only under the double lock — a fresh session plus an explicit human declaration of the recap role — and only when the job is genuinely cross-conversation (recapping several past sessions at once, not extracting from the current one).",
2127
+ antiTrigger: "Do not harvest a chat that produced nothing durable: a quick fact lookup, a dead-end exploration, a routine step with no reusable insight. Do not let a one-off incident become a permanent rule before the pattern has actually repeated. And do not slip into the External Recap role mid-conversation or in an active working session — without the double lock, ERC would fire on everyday chat and bury the knowledge base in low-value cards. Harvesting noise is worse than missing it: a knowledge base full of trivia is one nobody trusts or reads.",
2128
+ input: "The full conversation text to sweep (or, for ERC, the set of past conversations to recap). The candidate reusable items spotted in it — decisions, lessons, methods, stable preferences. The target knowledge base each item should land in. The private-material boundary: which names, paths, numbers, and specifics must be rewritten before anything is filed. For ERC specifically, proof of the double lock: that this is a fresh session and that the human explicitly asked for the recap role.",
2129
+ inputsDetailed: [
2130
+ "Source conversation(s) — the full text to sweep for a single harvest, or the named set of prior sessions to recap for ERC. You cannot harvest what you did not actually read.",
2131
+ "Candidate items — the reusable things spotted in the source, each tagged by type: a decision ('we chose X over Y'), a lesson ('this failed because Z'), a method ('here is the repeatable way to do W'), or a stable preference ('the human consistently wants V').",
2132
+ "Target knowledge base — where each confirmed card belongs, so a card is filed, not just written and lost.",
2133
+ "Private-material boundary — the specific names, local paths, internal numbers, customer or person references that must be rewritten into general form before filing. This is the redaction map.",
2134
+ "Human confirmation gate — the explicit yes/no on each card before it lands, so harvest proposes and the human disposes.",
2135
+ "ERC double-lock proof (ERC only) — evidence that this is a fresh session AND that the human explicitly declared the recap role, since ERC may not run without both."
2136
+ ],
2137
+ process: [
2138
+ "Sweep the source conversation end to end and pull out candidate reusable items, each tagged by type: decision, lesson, method, or stable preference. Do not generalize from a single occurrence — a one-off only becomes a rule once the pattern has actually repeated.",
2139
+ "Turn each candidate into a harvest card: one item per card, stating what it is, why it is reusable, and where it should be filed. A decision card records the choice and the reason; a lesson card records what went wrong and what to do differently; a method card records the repeatable steps; a preference card records the stable want and the evidence it is stable.",
2140
+ "Redact as a built-in step, before filing — never after. Replace private material (real names, local paths, internal numbers, customer or person references) with general, public-safe wording. A card that still carries private specifics is not ready to file, no matter how useful it is.",
2141
+ "Show the cards to the human and let them confirm, edit, or drop each one. Harvest proposes; the human disposes. Nothing lands in a knowledge base on the AI's say-so alone.",
2142
+ "File each confirmed, redacted card into its target knowledge base, and log it so the same insight is not re-harvested next time.",
2143
+ "For External Recap (ERC): only with the double lock satisfied (fresh session + explicit human role declaration), run the same harvest across the named set of prior conversations, produce cross-conversation cards and candidates, and hand them back for the human to confirm — recapping, not rewriting the source history, and still redacting before anything is filed."
2144
+ ],
2145
+ outputShape: [
2146
+ "Harvest cards: one per reusable item, each tagged decision / lesson / method / stable preference, with what it is and why it is reusable.",
2147
+ "Target knowledge base for each card: where it lands once confirmed.",
2148
+ "Redaction record: which private specifics were rewritten into general form before filing.",
2149
+ "Repeat evidence for any card proposed as a rule: the occurrences that justify generalizing.",
2150
+ "Human confirmation: the yes / edit / drop decision recorded per card.",
2151
+ "ERC scope (ERC only): the set of conversations recapped and the double-lock confirmation that authorized the role."
2152
+ ],
2153
+ template: [
2154
+ "Source conversation(s) swept (single harvest) or recapped (ERC):",
2155
+ "Candidate items by type (decision / lesson / method / stable preference):",
2156
+ "Harvest cards (one item each: what it is + why reusable + target knowledge base):",
2157
+ "Redaction record (private specifics rewritten into general form, before filing):",
2158
+ "Repeat evidence (occurrences) for anything proposed as a rule:",
2159
+ "Human confirmation per card (yes / edit / drop):",
2160
+ "ERC double-lock check (fresh session + explicit role declaration) — ERC only:"
2161
+ ],
2162
+ passBar: [
2163
+ "Each reusable item became its own card, tagged by type, instead of a vague 'lessons learned' blob.",
2164
+ "Every card was redacted into public-safe wording before filing — no private names, paths, internal numbers, or person references survive.",
2165
+ "Anything proposed as a reusable rule rests on a repeated pattern, not a single incident.",
2166
+ "The human confirmed (or edited / dropped) each card; nothing was filed on the AI's say-so alone.",
2167
+ "Each confirmed card landed in a named knowledge base and was logged so it is not re-harvested.",
2168
+ "If the External Recap role was used, the double lock (fresh session + explicit role declaration) was satisfied and recorded."
2169
+ ],
2170
+ rejectBar: [
2171
+ "Harvest stored every detail as cards, so the knowledge base fills with trivia and nobody trusts it.",
2172
+ "A card was filed with private specifics still in it — redaction was skipped or left for 'later'.",
2173
+ "A single occurrence was promoted straight to a permanent rule with no repeated evidence.",
2174
+ "Cards were written into a knowledge base without the human confirming them.",
2175
+ "The External Recap role ran without the double lock — on an everyday chat or inside an active working session.",
2176
+ "ERC rewrote or exposed the raw source history instead of producing redacted, confirmable recap cards."
2177
+ ],
2178
+ misuse: [
2179
+ "Treating harvest as 'save the whole transcript' — storing everything as cards until the knowledge base is clutter no one reads.",
2180
+ "Filing a genuinely useful card before redacting it, so a real name, local path, or internal number leaks into the knowledge base.",
2181
+ "Generalizing from one bad run into a permanent rule, so a single incident hardens into law without the pattern ever repeating.",
2182
+ "Writing cards straight into the knowledge base without the human's confirmation, turning 'propose' into 'decide'.",
2183
+ "Slipping into the External Recap role mid-conversation or in a live working session, so ERC fires without its double lock and sweeps ordinary chat.",
2184
+ "Letting ERC paste or rewrite the raw source conversations instead of producing redacted, confirmable recap cards — which re-exposes exactly the private material harvest is supposed to strip."
2185
+ ],
2186
+ example: "A single-conversation harvest turns one discussion into two cards — a decision card ('chose the temp-cache dry-run path because it isolates the local-cache failure') and a lesson card ('do not label a release before the packaged smoke test runs') — each redacted into public-safe wording, confirmed by the human, and filed; ERC would do the same across several past sessions, but only under its fresh-session-plus-explicit-role double lock.",
2187
+ filledExample: {
2188
+ scenario: "A working session ends with a real choice having been made about how to verify a synthetic app's release. The discussion is full of useful judgment that would otherwise evaporate. The human says 'harvest this', so the AI sweeps the conversation, proposes cards, redacts them, and files the confirmed ones. (An External Recap would look the same but span several past sessions — here it is a single-conversation harvest, so the ERC double lock is noted as not engaged.)",
2189
+ lines: [
2190
+ "### Source conversation swept",
2191
+ "One working session that debated how to confirm a release was safe, hit a snag with a local cache during packaging, and settled on a specific verification path.",
2192
+ "",
2193
+ "### Candidate items by type",
2194
+ "- Decision: the team chose to run the package dry-run against a temporary cache rather than the default local cache.",
2195
+ "- Lesson: an earlier near-miss happened because a release was labeled before the packaged smoke test had run.",
2196
+ "- (Considered and dropped) a passing remark about file naming — one-off, no reuse value, not carded.",
2197
+ "",
2198
+ "### Harvest cards",
2199
+ "Card 1 — DECISION. What: when packaging fails only because of a stale local cache, run the dry-run against a temporary cache to isolate the real file-list check from the cache noise. Why reusable: the same cache trap recurs across releases. Target knowledge base: the team's release-practices notes.",
2200
+ "Card 2 — LESSON. What: never move a release label from 'candidate' to 'releasable' before the packaged smoke test has actually run and shown a complete file list. Why reusable: it is a guardrail that prevents shipping an incomplete package. Target knowledge base: the release-practices notes, under pre-release checks.",
2201
+ "",
2202
+ "### Redaction record (before filing)",
2203
+ "Replaced the specific app name with 'a synthetic note app'; replaced concrete local cache paths with 'the local cache' / 'a temporary cache'; removed the contributor's name and referred to 'the human' / 'the team'. No real names, paths, or internal numbers remain on either card.",
2204
+ "",
2205
+ "### Repeat evidence for the rule",
2206
+ "Card 2 is proposed as a standing rule, so it is backed by more than this one chat: the same 'labeled too early' miss had shown up in a prior release loop. Two occurrences, not one, justify generalizing it into a rule. Card 1 is filed as a reusable method, not a hard rule, so it needs less.",
2207
+ "",
2208
+ "### Human confirmation per card",
2209
+ "Card 1: confirmed, filed as written. Card 2: confirmed with a small edit (the human tightened 'smoke test' to 'packaged smoke test' for precision), then filed. The dropped file-naming remark: confirmed dropped.",
2210
+ "",
2211
+ "### ERC double-lock check",
2212
+ "Not engaged. This is a single-conversation harvest inside the working session, not a cross-session recap. The External Recap role would require a brand-new session AND an explicit human declaration of that role before it could run across several past conversations; neither applies here, so harvest runs in its ordinary single-conversation form."
2213
+ ]
2214
+ },
2215
+ failures: [
2216
+ "Harvest stores every detail and becomes clutter.",
2217
+ "External recap exposes private source material.",
2218
+ "A single incident becomes a permanent rule without repeated evidence."
2219
+ ]
2220
+ },
2221
+ {
2222
+ id: "do-not-handle-yet",
2223
+ title: "Do Not Handle Yet",
2224
+ purpose: "Protect the main line by explicitly parking tempting but lower-priority work.",
2225
+ trigger: "Use when a task reveals adjacent bugs, polish ideas, product questions, or architecture tangents.",
2226
+ input: "Main goal, current slice, tempting adjacent item, risk if handled now, and revisit condition.",
2227
+ process: [
2228
+ "Name the current slice and completion standard.",
2229
+ "Write the adjacent item in a parking note.",
2230
+ "Explain why handling it now would harm the main line.",
2231
+ "Define when to revisit it."
2232
+ ],
2233
+ template: [
2234
+ "Current slice:",
2235
+ "Parked item:",
2236
+ "Reason not now:",
2237
+ "Risk if handled now:",
2238
+ "Revisit condition:",
2239
+ "Owner decision needed:",
2240
+ "Storage target:"
2241
+ ],
2242
+ example: "During CLI init work, visual branding polish is parked until init, demo, check, privacy scan, and package dry-run are green.",
2243
+ failures: [
2244
+ "Parking becomes deletion because no revisit condition exists.",
2245
+ "The assistant handles the parked item anyway.",
2246
+ "The parking note hides a true blocker."
2247
+ ]
2248
+ },
2249
+ {
2250
+ id: "plain-language-first-screen",
2251
+ title: "Plain-Language First Screen",
2252
+ purpose: "Make the first screen explain the result, path, and proof before concepts or framework names.",
2253
+ trigger: "Use for README, START_HERE, handoff, review results, and any user-facing guide.",
2254
+ input: "Audience, first action, proof artifact, main contrast, and one next step.",
2255
+ process: [
2256
+ "Start with what the user can do in ten minutes.",
2257
+ "Show before/after instead of abstract philosophy.",
2258
+ "Name the files or commands that prove the claim.",
2259
+ "Move deeper theory below the first-run path."
2260
+ ],
2261
+ template: [
2262
+ "Audience:",
2263
+ "First-screen claim:",
2264
+ "Ten-minute action:",
2265
+ "Before/after proof:",
2266
+ "Files or commands:",
2267
+ "What this is not:",
2268
+ "Next step:"
2269
+ ],
2270
+ example: "START_HERE opens with the demo path and raw-chat comparison, then links to architecture docs after the user has something runnable.",
2271
+ failures: [
2272
+ "The first screen becomes a manifesto.",
2273
+ "The user sees concepts before a runnable path.",
2274
+ "The guide claims value without a before/after proof."
2275
+ ]
2276
+ },
2277
+ {
2278
+ id: "honest-calibration",
2279
+ title: "Honest Calibration",
2280
+ purpose: "Offset the model's built-in eagerness to please by pinning one short user-side prefix to the front of every ask for a rating, an evaluation, or a recommendation: be candid, do not inflate, do not over-hedge. The point is not to hope the AI will be honest — it is to know that, left uncalibrated, a model slides back toward the answer that makes you feel good, so you re-aim it on each ask. The prefix pulls the baseline from make-you-happy back to tell-the-truth, and it matters most exactly where the temptation to flatter is highest: when you are asking the AI to judge your own work, your own ability, or your own output.",
2281
+ trigger: "Use whenever you ask the AI to grade, score, place, rank, or recommend — and most of all when the thing being judged is yours: your draft, your plan, your skill level, the quality tier your output would land in, whether something is ready to ship or publish. If a falsely high 'this is great' would cost you (you publish too early, you skip a fix, you misjudge where you really stand), put the calibration prefix in front of the ask.",
2282
+ antiTrigger: "Do not bolt it onto a plain fact lookup or a direct instruction to carry out — there is no evaluation to calibrate, so the prefix is just noise. 'Be candid, do not inflate' in front of 'what is the capital of X' or 'rename this file' adds nothing, and a calibration ritual stapled to every message trains you to stop noticing it on the one ask where it actually changes the answer. It is also not a license to flip to harsh: the instruction is to stop both inflating AND over-hedging, not to make the AI negative on command.",
2283
+ input: "The specific thing to judge, stated plainly (the draft, the plan, the output, the ability). What the judgment is for (publish / ship / keep iterating / a self-honest gut check), so the AI calibrates to a real bar instead of a vague vibe. The reference frame you want it measured against (a quality tier, a percentile, a named standard, a comparison set) so 'good' has an anchor. The candor prefix itself, placed at the FRONT of the ask, not buried after it. And, when the thing under judgment is your own, an explicit nudge to step outside your point of view and not grade to please you.",
2284
+ inputsDetailed: [
2285
+ "The exact artifact, ability, or output to evaluate — named concretely so the AI grades a real thing, not a generality.",
2286
+ "The purpose of the judgment (decide whether to publish, whether to ship, whether to keep working, or just to know honestly where you stand), so the bar is the real-world consequence, not a feeling.",
2287
+ "The reference frame: the quality tier, percentile band, named standard, or comparison set the answer should be measured against, so 'good' or 'B+' is anchored rather than floating.",
2288
+ "The candor prefix, placed at the FRONT of the request (be candid, do not inflate, do not over-hedge) — position matters, because a prefix sets the stance before the model starts composing the pleasing version.",
2289
+ "When you are the subject (your work, your skill, your output), an explicit 'step outside my perspective and do not grade to make me feel good' so the highest-flattery case gets the strongest calibration.",
2290
+ "Optional: permission to deliver the verdict bluntly and lead with the weakest part, so the honest signal is not softened into mush on its way out."
2291
+ ],
2292
+ process: [
2293
+ "Put the candor prefix first, before the actual ask. Lead the request with 'be candid, do not inflate, do not over-hedge' (or your own words for it) so the stance is set before the model reaches for the agreeable framing. A prefix after the question is half as effective as a prefix before it, because by then the answer is already forming around what would please you.",
2294
+ "Anchor the judgment to a real bar, not a vibe. Name the reference frame — a tier, a percentile, a named standard, a comparison set — so the AI cannot retreat to a safely flattering 'it's pretty good'. 'Be candid' with nothing to be candid against just produces a more confident vague compliment.",
2295
+ "Apply the strongest calibration when the subject is you. Self-evaluation is the peak-flattery case: the model most wants to please you exactly when you are asking about your own work or ability. Add the explicit 'step outside my point of view and do not grade to make me feel good' here, and treat a suspiciously warm verdict on your own output as a signal to re-ask, not as good news.",
2296
+ "Read the answer for the tells of an uncalibrated slide-back: it opens with praise and buries the real critique; every weakness is immediately cushioned ('but this is genuinely strong'); the score drifts upward with no new evidence; it agrees with your own stated hope a little too readily. Any of these means the baseline slid back toward make-you-happy and the prefix needs re-asserting.",
2297
+ "Re-aim when it slides. The model does not hold the candid stance forever — over a long thread it relaxes back into the pleasing default. When you catch the tells, restate the prefix and ask for the verdict again; do not accept the warmed-over version just because re-asking feels awkward.",
2298
+ "Separate the candid verdict from encouragement, and keep them in that order. A useful honest answer can still end with 'and here is the fastest path up' — but the true placement comes first and unhedged, and the encouragement comes after, clearly marked as the next step rather than as a softener that quietly raises the grade."
2299
+ ],
2300
+ outputShape: [
2301
+ "A candid verdict stated first and plainly: the tier, score, percentile, or yes/no, without an opening cushion of praise.",
2302
+ "The anchor it was measured against (the named tier, percentile, standard, or comparison set), so the verdict is checkable rather than a floating adjective.",
2303
+ "The weakest part led with, not buried: the single biggest reason it is not higher, stated before any reassurance.",
2304
+ "No upward drift: the score does not creep higher than the evidence supports, and warmth is not substituted for a number.",
2305
+ "Encouragement, if any, clearly separated and placed last — the fastest path up, marked as a next step, never folded back into the grade.",
2306
+ "On a self-evaluation, an explicit note that the AI graded from outside your perspective rather than to please you."
2307
+ ],
2308
+ template: [
2309
+ "Candor prefix (paste at the FRONT of the ask): be candid, do not inflate and do not over-hedge; step outside my perspective and do not grade this to make me feel good.",
2310
+ "What to judge (the exact draft / plan / output / ability):",
2311
+ "What the judgment is for (publish / ship / keep iterating / honest gut check):",
2312
+ "Reference frame to measure against (tier / percentile / named standard / comparison set):",
2313
+ "Is the subject mine? (if yes, apply the strongest calibration and grade from outside my view):",
2314
+ "Candid verdict first (tier / score / percentile / yes-no, no opening praise):",
2315
+ "Single biggest reason it is not higher (led with, not cushioned):",
2316
+ "Tells of a slide-back to watch for (praise-first / every flaw cushioned / score drifts up / agrees with my hope too fast):",
2317
+ "Fastest path up (optional, placed LAST, marked as a next step not a grade-softener):"
2318
+ ],
2319
+ passBar: [
2320
+ "The candor prefix sat at the FRONT of the ask, setting the stance before the answer formed.",
2321
+ "The verdict is anchored to a named bar (tier / percentile / standard / comparison set), not a floating 'pretty good'.",
2322
+ "The weakest point is stated first and unhedged, rather than buried under an opening of praise.",
2323
+ "The score reflects the evidence and did not drift upward, and warmth was not used in place of a real number.",
2324
+ "On a self-evaluation, the AI grades from outside your perspective and says so, instead of grading to please you.",
2325
+ "Encouragement, if present, is separated out and placed last as a next step — never blended back into the grade."
2326
+ ],
2327
+ rejectBar: [
2328
+ "The answer opens with praise and the real critique is buried below it (the classic flatter-first slide-back).",
2329
+ "The verdict is a warm adjective with no anchor — 'this is strong' against nothing checkable.",
2330
+ "Every weakness is immediately cushioned so no honest signal survives to the reader.",
2331
+ "The grade crept upward across the thread with no new evidence, tracking your stated hope rather than the work.",
2332
+ "The prefix was tacked on AFTER the question, so the pleasing version had already formed.",
2333
+ "The candor was read as a license to be harsh, producing a put-down instead of an inflation-free, hedge-free truth."
2334
+ ],
2335
+ misuse: [
2336
+ "Stapling the prefix to plain fact lookups and direct instructions, so it becomes background noise you stop noticing on the one ask that needs it.",
2337
+ "Putting 'be candid' after the question instead of in front of it, so the model has already composed the agreeable answer before the stance lands.",
2338
+ "Accepting a suspiciously warm verdict on your own work because re-asking feels awkward — the peak-flattery case is exactly where you must re-aim.",
2339
+ "Treating the prefix as a one-time setting rather than re-asserting it when the thread drifts back toward pleasing you.",
2340
+ "Flipping the instruction into 'be harsh', so you trade a flattering distortion for a punitive one instead of getting an undistorted read.",
2341
+ "Letting the encouragement at the end quietly raise the grade ('it's a B, but honestly almost an A') so the candid verdict is undone in its own last line."
2342
+ ],
2343
+ example: "A writer asks an AI to rate a finished essay. With no prefix, the AI opens 'This is a compelling, well-structured piece!' and the real problem — a thesis that never actually lands — shows up softened in paragraph three. The writer re-asks with the calibration prefix at the front: 'Be candid, do not inflate and do not over-hedge, and step outside my view: what tier would this land in on a serious long-form platform, and why not higher?' Now the AI leads with the verdict ('mid-tier, would not stand out — the thesis is stated but never argued') anchored to the platform tier, names the single biggest reason first, and only then offers the fastest path up — clearly marked as a next step, not folded back into the grade.",
2344
+ filledExample: {
2345
+ scenario: "A solo founder has drafted a launch post for a synthetic product and wants to know, honestly, whether it is ready to publish. They have been staring at it for days and suspect they have lost perspective. They run Honest Calibration rather than just asking 'is this good?'.",
2346
+ lines: [
2347
+ "### Candor prefix (pasted at the FRONT of the ask)",
2348
+ "Be candid, do not inflate and do not over-hedge. Step outside my perspective and do not grade this to make me feel good — I have been staring at it for days and I would rather hear it is mid than publish something mid thinking it is great.",
2349
+ "",
2350
+ "### What to judge, and what for",
2351
+ "This launch post (pasted below). The judgment is for a real decision: publish it as-is to a public audience today, or keep working it.",
2352
+ "",
2353
+ "### Reference frame to measure against",
2354
+ "Where would it land among launch posts that actually get traction — top quartile, solid-but-forgettable middle, or below the bar — and what specifically keeps it out of the top quartile?",
2355
+ "",
2356
+ "### Candid verdict first (no opening praise)",
2357
+ "The AI leads with the placement, not a compliment: \"Solid-but-forgettable middle. It would not embarrass you, and it would not travel. As-is, I would not publish it for a launch you only get to do once.\"",
2358
+ "",
2359
+ "### Single biggest reason it is not higher (led with, not cushioned)",
2360
+ "\"The post explains what the product does but never makes me feel the problem it removes. Top-quartile launch posts open on a sharp, specific pain the reader already feels; yours opens on a feature list. That is the one change that moves it, and everything else is secondary.\"",
2361
+ "",
2362
+ "### A tell the founder watched for, and did not find",
2363
+ "The founder was specifically watching for the flatter-first slide-back — an answer that opened 'This is a strong, clear post!' and buried the real issue underneath. Because the candor prefix sat at the front, the verdict led with the placement instead, and the score did not drift up to match the founder's obvious hope that it was ready.",
2364
+ "",
2365
+ "### Fastest path up (placed LAST, marked as a next step, not a grade-softener)",
2366
+ "\"Next step, not a grade change: rewrite the opening around the single sharpest moment of the pain this removes, then re-judge. That one move is what separates the forgettable middle from a post that gets shared — the rest of the draft is already good enough to carry it once the opening lands.\"",
2367
+ "",
2368
+ "### Why the prefix changed the outcome",
2369
+ "Asked plainly, the same model would almost certainly have opened with warmth and let the founder publish a middle-tier post on the strength of a compliment. The front-loaded candor prefix re-aimed the baseline from 'make the founder feel good' to 'tell the founder the truth', and the explicit 'do not grade to please me' hit exactly the self-evaluation case where the pull to flatter is strongest."
2370
+ ]
2371
+ },
2372
+ failures: [
2373
+ "Over a long session the AI drifts back to praising the user's work and the calibration prefix is never re-asserted.",
2374
+ "The candid verdict is undercut by an end-of-answer reassurance that quietly raises the grade.",
2375
+ "The prefix is treated as a ritual and stapled onto every message, including plain fact lookups and direct instructions, so it stops registering on the evaluations that need it."
2376
+ ]
2377
+ },
2378
+ {
2379
+ id: "feedback-absorption-ledger",
2380
+ title: "Feedback Absorption Ledger",
2381
+ purpose: "Keep independent judgment alive when you are synthesizing feedback from several sources by scoring each incoming point across five tiers instead of silently rubber-stamping all of it: absorb fully, absorb and refine, absorb with a boundary, partly absorb, or reject with a reason. The trap this defends against is the controller who collects three reviews and quietly accepts everything, becoming a courier for other people's opinions; the equal-and-opposite trap is reflexively rejecting things to look independent. The ledger forces a per-item decision with a stated reason, and treats the absorb/reject ratio as an OUTCOME of honest judgment, never a target to hit.",
2382
+ trigger: "Use when you are the one merging feedback from more than one source into a single revision or decision: two or three reviews of the same artifact, a mix of guard verdicts plus a stakeholder's notes, several rounds of comments you have to reconcile, or any moment where 'I got a lot of feedback, now what do I actually do with it' is the real question. It is for the synthesis step, where the temptation to either accept-everything or defend-everything is highest.",
2383
+ antiTrigger: "Skip the full ledger for a single piece of feedback you can simply act on, or a trivial note with no judgment call in it (a typo fix, an obvious correction). Scoring one unambiguous comment across five tiers is ceremony, and a ritual you run on feedback that needed no deliberation trains you to skip it on the genuinely conflicting feedback where the per-item discipline is the whole point.",
2384
+ input: "Every incoming feedback item, kept as separate line items rather than blurred into one impression (so each can get its own decision). For each, enough of the original to judge it on its merits — what was said, by whom, and the reason behind it if given. The artifact or decision the feedback is about, and the bar it is being held to, so 'absorb' or 'reject' is measured against something. Your own read on each, because the ledger records YOUR judgment, not a vote tally. And a clear understanding that the final ratio is whatever honest per-item judgment produces — you are not aiming for a number.",
2385
+ inputsDetailed: [
2386
+ "The full set of feedback items, listed separately — one row per point, not a single merged blob, because the method's whole value is a per-item decision.",
2387
+ "For each item: what was actually said, the source, and the stated reason if there is one, so you judge the substance instead of the loudest voice.",
2388
+ "The artifact or decision under revision and the bar it must meet, so each absorb/refine/reject call is measured against a real standard.",
2389
+ "Your own independent read on each item — this is a judgment ledger, not a poll; agreement among sources does not auto-win and a lone dissent is not auto-dismissed.",
2390
+ "An explicit reason attached to every reject and every partial-absorb, because an unexplained rejection is indistinguishable from defensiveness.",
2391
+ "A clear stance going in that the absorb/reject ratio is an outcome of honest judgment, not a quota — you are forbidden both from rejecting to look independent and from accepting to avoid friction."
2392
+ ],
2393
+ process: [
2394
+ "List every feedback item separately before deciding anything. Resist forming one overall impression of 'the feedback' — the method only works if each point gets its own row and its own verdict.",
2395
+ "Score each item into exactly one of five tiers, and write the reason. (1) ABSORB FULLY: the point is right and its scope is clear — take it as-is. (2) ABSORB AND REFINE: the direction is right but you add a more precise version — keep the intent, improve the execution, and note what you added. (3) ABSORB WITH A BOUNDARY: accept it, but bound where it applies, naming the cases it should and should not cover. (4) PARTLY ABSORB: take one part and set aside or defer the rest, splitting the item into what you took and what you did not. (5) REJECT: decline it, and give an independent reason, contrary evidence, or an alternative — a reject with no reason does not count.",
2396
+ "Judge substance, not source weight or vote count. Three sources making the same weak point do not outvote one strong objection; a single sharp dissent can be the item you absorb fully while the majority note gets a boundary. The ledger records what is right, not what is popular.",
2397
+ "Hold the two opposite disciplines at once. You may NOT reject a sound point just to look independent or to avoid feeling like a courier; and you may NOT absorb a weak point just to avoid friction or to be agreeable. Both are failures of judgment in opposite directions, and the stated reason on each row is what keeps you honest about which one you might be slipping into.",
2398
+ "Treat the ratio as a readout, not a goal. After scoring, you can look at how much you absorbed versus refined, bounded, partly took, or rejected — but that distribution is the RESULT of judging each item honestly. Never nudge an individual call to make the overall ratio look more independent or more agreeable; the moment the ratio drives a row's verdict, the ledger is corrupted.",
2399
+ "Record the ledger so the synthesis is auditable. Keep the per-item tier and reason, especially for every reject and partial-absorb, so a later reviewer (or your future self) can see that each point was weighed on its merits rather than waved through or swatted away."
2400
+ ],
2401
+ outputShape: [
2402
+ "One row per feedback item — never a merged 'I considered all the feedback' summary.",
2403
+ "A single tier per row: absorb fully / absorb and refine / absorb with a boundary / partly absorb / reject.",
2404
+ "A stated reason on every row, and specifically an independent reason, contrary evidence, or alternative on every reject and partial-absorb.",
2405
+ "For refine rows, what you added beyond the original; for boundary rows, where it does and does not apply; for partial rows, what was taken and what was set aside.",
2406
+ "The resulting absorb/reject distribution shown as an outcome readout, with an explicit note that it was not a target.",
2407
+ "An auditable trail: enough that a later reader can see each point was judged on its merits, not by vote count or by source weight."
2408
+ ],
2409
+ template: [
2410
+ "Artifact / decision under revision, and the bar it must meet:",
2411
+ "Feedback items, listed separately (one row each — do not merge):",
2412
+ "Item 1 — what was said / source / its reason:",
2413
+ "Item 1 — tier (absorb fully / refine / boundary / partly / reject) + your reason:",
2414
+ "Item 2 — what was said / source / its reason:",
2415
+ "Item 2 — tier + your reason:",
2416
+ "(repeat one row per item — every reject and partial needs an independent reason, contrary evidence, or an alternative)",
2417
+ "Discipline check: did I reject anything just to look independent, or absorb anything just to avoid friction?",
2418
+ "Ratio readout (how much absorbed / refined / bounded / partly / rejected) — stated as an OUTCOME, not a target:",
2419
+ "Auditable note: the synthesis a later reviewer can trace back to per-item judgment:"
2420
+ ],
2421
+ passBar: [
2422
+ "Every feedback item has its own row and exactly one tier — nothing is merged into a single overall impression.",
2423
+ "Every reject and every partial-absorb carries an independent reason, contrary evidence, or a named alternative.",
2424
+ "Items were judged on substance, not on how many sources said them or how senior the source was.",
2425
+ "Neither failure direction is present: nothing was rejected merely to look independent, nothing absorbed merely to avoid friction.",
2426
+ "The absorb/reject ratio is presented as an outcome of the per-item calls, with no sign that any row was bent to make the ratio look a certain way.",
2427
+ "The ledger is auditable: a later reader can see each point was weighed, not waved through or swatted away."
2428
+ ],
2429
+ rejectBar: [
2430
+ "The feedback was accepted wholesale — 'all good points, I'll incorporate them' — with no per-item decision (the courier failure the ledger exists to prevent).",
2431
+ "A point was rejected with no reason, contrary evidence, or alternative, so it cannot be told apart from reflexive defensiveness.",
2432
+ "Decisions tracked vote count or source seniority instead of substance (the majority note won just for being the majority).",
2433
+ "An individual call was nudged to make the overall ratio look more independent or more agreeable — the ratio drove the verdict.",
2434
+ "Sound feedback was declined specifically to avoid feeling like a rubber stamp (independence theater), or weak feedback absorbed specifically to keep the peace.",
2435
+ "The items were blurred into one impression, so there is no auditable trail of which point got what verdict and why."
2436
+ ],
2437
+ misuse: [
2438
+ "Collapsing the items into one 'I took the feedback on board' summary, which is exactly the rubber-stamp the ledger is built to stop.",
2439
+ "Setting a target ratio ('I should reject about a third to stay independent') and then bending individual calls to hit it — the ratio is an outcome, never a goal.",
2440
+ "Rejecting a sound point to perform independence, trading the courier failure for an equal-and-opposite defensiveness.",
2441
+ "Absorbing a weak point to avoid friction with the source, then backfilling a reason that does not really hold.",
2442
+ "Counting votes — letting three echoes of the same shallow note outweigh one strong objection because three feels like more.",
2443
+ "Leaving rejects and partials without a stated reason, so the synthesis cannot be audited and looks identical to swatting feedback away."
2444
+ ],
2445
+ example: "A controller merging three reviews of a synthetic spec does not write 'all helpful, will incorporate'. They ledger it: reviewer A's data-shape fix is ABSORBED FULLY (right and clearly scoped); reviewer B's 'add retries everywhere' is ABSORBED WITH A BOUNDARY (yes for the network call, no for the local parse, with the reason); reviewer C's 'rename the module' is REJECTED with an independent reason (the name encodes a deliberate distinction C missed) plus an alternative comment. The resulting mix — one full, one bounded, one rejected — is reported as the outcome of judging each on its merits, explicitly not a quota the controller was aiming for.",
2446
+ filledExample: {
2447
+ scenario: "A maintainer of a synthetic open-source tool gets three reviews on the same pull request and has to merge them into one revision. The instinct is to thank everyone and apply all of it. Instead they run the Feedback Absorption Ledger so the synthesis stays their own judgment.",
2448
+ lines: [
2449
+ "### Artifact under revision and the bar",
2450
+ "A pull request adding a retry wrapper to a synthetic data-fetch tool. Bar: it must be correct, must not retry non-idempotent work, and must stay readable for the next maintainer.",
2451
+ "",
2452
+ "### Feedback items, listed separately (not merged)",
2453
+ "Item 1 (reviewer A): 'The backoff is fixed-interval; under load this will thundering-herd. Use exponential backoff with jitter.'",
2454
+ "Item 2 (reviewer B): 'Wrap every external call in the retry, not just the fetch — be consistent.'",
2455
+ "Item 3 (reviewer C): 'Rename `fetchOnce` to `fetch`; the `Once` suffix is ugly.'",
2456
+ "Item 4 (reviewer A and reviewer B, same point): 'Add a comment explaining the retry budget.'",
2457
+ "",
2458
+ "### Item 1 — tier + reason",
2459
+ "ABSORB AND REFINE. The direction is right (fixed interval is a real herd risk), so I take A's exponential backoff with jitter and add the more precise version: a cap on the backoff delay, so the wait between retries does not grow unbounded on a long outage. What I added beyond A's note: the cap, which A did not mention.",
2460
+ "",
2461
+ "### Item 2 — tier + reason",
2462
+ "ABSORB WITH A BOUNDARY. Yes for idempotent reads, NO for the write path — blindly retrying a non-idempotent write can double-apply it. Boundary stated in code and review reply: retry wraps reads only; writes are explicitly excluded with a reason.",
2463
+ "",
2464
+ "### Item 3 — tier + reason",
2465
+ "REJECT, with an independent reason and an alternative. `Once` is not ugliness — it encodes that the function performs exactly one attempt, which is the contract the retry wrapper depends on; renaming it to `fetch` would blur that distinction and invite someone to add a second internal retry. Alternative offered to C: if the suffix reads oddly, rename to `fetchAttempt`, which keeps the single-attempt meaning. Not rejected to look independent — rejected because the name carries information C's note would erase.",
2466
+ "",
2467
+ "### Item 4 — tier + reason",
2468
+ "ABSORB FULLY. Right and clearly scoped, and the fact that two reviewers raised it does not change the verdict — it would be a full absorb even as a lone note, because a retry budget is genuinely opaque without a comment. (Logged explicitly so the ledger shows substance decided it, not the vote count.)",
2469
+ "",
2470
+ "### Discipline check (both directions)",
2471
+ "Did I reject anything just to look independent? Checked item 3 specifically — no; the reason stands on the contract, and I offered C a real alternative. Did I absorb anything just to avoid friction? Checked item 2 — I could have just said yes to 'wrap everything' to be agreeable, but the write-path boundary is load-bearing, so it gets a boundary, not a full absorb.",
2472
+ "",
2473
+ "### Ratio readout (outcome, not a target)",
2474
+ "Of four items: one full, one refine, one boundary, one reject. I was not aiming for any split — this is simply what judging each point on its merits produced. If all four had been sound I would have absorbed all four; the one rejection is there because one point did not hold, not to keep a ratio looking independent.",
2475
+ "",
2476
+ "### Auditable note",
2477
+ "The PR revision links each change back to its ledger row, so the next maintainer can see why writes are excluded from retry and why `fetchOnce` kept its suffix — the synthesis is traceable to per-item judgment, not a wholesale 'applied all review feedback'."
2478
+ ]
2479
+ },
2480
+ failures: [
2481
+ "The reviewer accepts all incoming feedback wholesale and the synthesis becomes a courier for other people's opinions rather than an independent revision.",
2482
+ "A target ratio creeps in and individual calls get bent to hit it, so the ledger stops recording honest judgment.",
2483
+ "Rejects and partial-absorbs are left without reasons, so the synthesis cannot be audited and looks the same as reflexive defensiveness."
2484
+ ]
2485
+ },
2486
+ {
2487
+ id: "collaboration-coach",
2488
+ title: "Collaboration Coach",
2489
+ purpose: "Make the assistant proactively remind the user of the matching collaboration step at six recurring moments — start a task, define done, review a completion claim, hand off, harvest, update the profile — instead of waiting to be asked, so the workspace teaches itself while the user works rather than sitting unused behind a manual. The hard constraint is restraint: prompt at key moments only, once per moment, never every turn, because over-prompting is the fastest way to get the whole system uninstalled.",
2490
+ trigger: "Run it as a standing behavior the moment a collaboration moment fires: a new task or vague idea arrives, the assistant is about to act before 'done' is defined, it just claimed completion, the thread is getting long or work is moving to another tool, a reusable judgment or lesson surfaced, or the same preference has shown up several times. The point is that the assistant raises the matching step on its own at that moment, not three turns later when the user finally remembers to ask.",
2491
+ antiTrigger: "Do not prompt on low-stakes, fast-turn work: a quick fact lookup, a one-line edit, a yes/no confirmation, a casual exchange, or any moment where the user clearly just wants the answer. Do not re-fire a reminder the user already acted on or explicitly waved off. A reminder on trivial work is noise, and noise on every turn trains the user to mute the coach exactly when a real high-stakes moment arrives — so the failure here is not 'missed a prompt', it is 'prompted so often the user turned it off'.",
2492
+ input: "The live collaboration moment (which of the six nodes is firing, and the signal that tripped it). The current restraint tier (light / standard / strict), defaulting to standard. Whether this exact reminder has already fired or been dismissed in this thread, so it is not repeated. For the completion-claim node: how many model families are available, so the right guard depth is named (one family -> single-tool-guard; a second different family -> dual-guard; multi-tool -> full fusion review). The matching concrete next action for whichever node fired, so the prompt hands the user a step, not a lecture.",
2493
+ inputsDetailed: [
2494
+ "The firing node (1 task start, 2 pre-execution, 3 completion claim, 4 long thread / tool switch, 5 reusable insight, 6 repeated preference) and the signal that tripped it.",
2495
+ "Restraint tier: light (nodes 3 and 4 only), standard (default — fire at nodes 1, 3, 4, 6; fold node 2 into the task-start reminder and node 5 into a natural pause; count once-per-moment by task phase not by node), or strict (all six, every time they fire); default standard.",
2496
+ "Already-fired / dismissed memory for this thread, so a reminder is not repeated after the user has acted on it or waved it off.",
2497
+ "Model-family count at the completion-claim node, so the guard depth is named correctly (one -> single-tool-guard, two different -> dual-guard, multi-tool -> full fusion).",
2498
+ "The concrete next action attached to the firing node, so the prompt offers a step (write the acceptance card, open single-tool-guard, write the handoff) rather than an abstract nudge.",
2499
+ "User restraint command, if any (\`coach: light\` / \`coach: standard\` / \`coach: strict\`), which changes how often the coach speaks up."
2500
+ ],
2501
+ process: [
2502
+ "On the FIRST message after install, act proactively: introduce yourself, offer to scan the user's recent work, state the privacy boundary before scanning (the scan is run by you, the cloud AI they already use, so content passes through your provider like any normal chat — not 'zero data leaves the machine'), and respect their yes / narrow / no choice — not just recite a list of future reminder moments. Then say reminders are restrained by default and switchable with \`coach: light\` / \`coach: strict\`. Do this once, then stop.",
2503
+ "Map each firing moment to its node and reminder. 1 Task start -> set a context boundary and acceptance before building. 2 Pre-execution -> define the acceptance card first. 3 Completion claim -> run a guard review before trusting it. 4 Long thread / tool switch -> generate a handoff instead of relying on chat memory. 5 Reusable insight -> harvest it into a card. 6 Repeated preference -> offer it as a profile-update candidate.",
2504
+ "At node 3 (completion claim), branch on available model families and name the matching guard: one model family only -> run single-tool-guard (a new conversation plus an adversarial prompt); a second, different family available -> run dual-guard (the cross-family binding gate); a multi-tool setup -> run the full fusion review. Do not silently skip the branch and just say 'looks good'.",
2505
+ "Apply the restraint tier before speaking. Light: only fire at nodes 3 and 4. Standard (default): fire at nodes 1, 3, 4, 6 — fold node 2 into the task-start reminder (skip it entirely if node 1 already landed an acceptance card) and node 5 into a natural pause, not a separate interruption; count \"once per moment\" by task phase, not by node, so the opening of one task is a single moment even if nodes 1 and 2 both trip — never stack reminders on back-to-back turns at a task's start, and never re-raise a reminder already acted on. Strict: fire at all six every time they trip. If a tier would make you repeat a just-given reminder, stay silent instead.",
2506
+ "Keep each reminder to one or two sentences that hand over the concrete next step, then continue the actual work. Do not pause to explain the philosophy of the layer, do not stack multiple reminders into a wall, and do not lecture — a prompt the user cannot act on in one move is noise.",
2507
+ "Honor a restraint switch immediately. When the user says \`coach: light\` / \`coach: standard\` / \`coach: strict\`, change tier for the rest of the session without arguing, and confirm the new tier in a few words."
2508
+ ],
2509
+ outputShape: [
2510
+ "First-run promise: on the first reply, the assistant proactively introduces itself, offers to scan the user's recent work, states the privacy boundary before scanning, and respects the user's yes / narrow / no choice — rather than passively reciting a list of future reminder moments.",
2511
+ "Per-moment reminder: the firing node named, plus the one-or-two-sentence concrete next step it hands over.",
2512
+ "Completion-claim branch: which guard was named (single-tool-guard / dual-guard / full fusion) based on available model families.",
2513
+ "Restraint state: the current tier and a note that the reminder respected it (and was not a repeat of one already acted on).",
2514
+ "Tier change acknowledgement: when the user switches, the new tier confirmed in a few words.",
2515
+ "Continuation: the reminder is followed by getting on with the actual task, not by a paragraph of theory."
2516
+ ],
2517
+ template: [
2518
+ "First-run promise (first reply, proactive — introduce + offer to scan + state the privacy boundary + respect yes / narrow / no):",
2519
+ "Firing node (1-6) and the signal that tripped it:",
2520
+ "Restraint tier in effect (light / standard / strict; default standard):",
2521
+ "Already fired or dismissed this thread? (if yes, stay silent):",
2522
+ "Matching reminder (one or two sentences, hands over the concrete next step):",
2523
+ "Completion-claim branch (one family -> single-tool-guard / two different -> dual-guard / multi-tool -> full fusion):",
2524
+ "Tier change to acknowledge (if the user said coach: light / standard / strict):",
2525
+ "Continue the actual task:"
2526
+ ],
2527
+ passBar: [
2528
+ "On the first reply the assistant acted proactively: it introduced itself, offered to scan recent work, stated the privacy boundary before scanning, respected the user's yes / narrow / no choice, and did not repeat the intro afterward.",
2529
+ "Each reminder fired at the right node with a concrete next step the user can act on in one move.",
2530
+ "The completion-claim node named the correct guard depth for the number of model families available.",
2531
+ "Restraint held: standard by default, once per moment, no reminder the user already acted on was re-raised, and a \`coach:\` switch was honored immediately.",
2532
+ "Reminders stayed short and the assistant got back to the work instead of lecturing about the layer."
2533
+ ],
2534
+ rejectBar: [
2535
+ "A reminder fires every turn, or the same reminder is repeated after the user already acted on it (over-prompting — the uninstall path).",
2536
+ "The completion-claim node says 'looks good' or skips the guard branch instead of naming single-tool-guard / dual-guard / full fusion.",
2537
+ "A reminder hands over a lecture or a vague nudge instead of a concrete next step.",
2538
+ "The default silently runs at strict (all six, every time) when the user never asked for it, burying the signal.",
2539
+ "A \`coach:\` restraint switch is ignored or argued with instead of applied for the rest of the session."
2540
+ ],
2541
+ misuse: [
2542
+ "Turning every turn into a reminder so the user mutes the coach — the most common way this mechanism gets the whole system uninstalled, and the exact opposite of thoroughness.",
2543
+ "Naming a step but not the action: 'you should probably guard this' with no pointer to single-tool-guard or dual-guard, so the user is reminded but not moved.",
2544
+ "Skipping the completion-claim branch and rubber-stamping 'done' as 'looks good', which is the one node that exists to stop a fluent false completion.",
2545
+ "Lecturing the theory of the coaching layer mid-task instead of handing over one concrete step and continuing.",
2546
+ "Defaulting to strict (or to silent) instead of standard, so the user either drowns in prompts or never gets the reminder that mattered.",
2547
+ "Re-firing a reminder the user already dismissed, which reads as nagging and trains them to ignore the next one."
2548
+ ],
2549
+ example: "A synthetic execution session reaches a 'done, implemented and tested' claim. At the completion-claim node the coach fires once, sees only one model family is available, and points the user to single-tool-guard (new conversation plus an adversarial prompt) instead of accepting the fluent claim — then continues, without lecturing.",
2550
+ filledExample: {
2551
+ scenario: "A solo user is working in one AI tool on a small synthetic feature. The coach runs at standard. The session passes through several collaboration moments; this shows the coach firing at the completion-claim node, once, with a concrete next step.",
2552
+ lines: [
2553
+ "### First-run: act proactively the first time",
2554
+ "\"You just installed a collaboration discipline — before I say 'done' I'll show evidence, when I'm unsure I'll pull in a second AI, you won't re-explain when you switch tools, and I'll help you save what's worth keeping. Want me to take 30 seconds and look at a few of your recent tasks to show you what this changes?\" Before scanning it states the privacy boundary (the scan is run by the cloud AI the user already uses, so content passes through that provider like any normal chat), respects yes / narrow / no, and gets to work; it does not repeat this.",
2555
+ "",
2556
+ "### Firing node and signal",
2557
+ "Node 3, completion claim. The execution assistant just returned: \"Done. I implemented the new sort option and added a test; everything passes.\" That 'done / everything passes' phrasing is the signal.",
2558
+ "",
2559
+ "### Restraint tier in effect",
2560
+ "Standard (default). Node 1 already fired once earlier this session, with node 2 folded into it (a single one-line 'want to set acceptance first?' at task start), so neither is re-raised. Node 3 has not fired yet, so it may fire now — once.",
2561
+ "",
2562
+ "### Matching reminder (one or two sentences, concrete next step)",
2563
+ "\"Before trusting that 'done', this is the completion-claim moment — worth a guard review. You're on a single tool right now, so the fitting move is single-tool-guard: open a fresh conversation and paste an adversarial reviewer prompt against this claim. Want the prompt?\"",
2564
+ "",
2565
+ "### Completion-claim branch",
2566
+ "Only one model family is available, so the coach named single-tool-guard (new conversation + adversarial prompt), explicitly NOT dual-guard. It noted in passing that if a second, different model family were available, the stronger move would be the cross-family dual-guard, and a multi-tool setup would run the full fusion review.",
2567
+ "",
2568
+ "### Tier change to acknowledge",
2569
+ "None this turn. (If the user had said 'coach: light', the coach would have confirmed 'switched to light — I'll only flag completion claims and tool switches now' and applied it for the rest of the session.)",
2570
+ "",
2571
+ "### Continue the actual task",
2572
+ "After the one reminder, the coach drops it and continues helping with the feature. It does not re-raise the guard reminder on the next turn, and it does not deliver a paragraph on why guarding matters — the user already has the one step they can act on."
2573
+ ]
2574
+ },
2575
+ failures: [
2576
+ "The coach fires a reminder on every turn until the user switches it off entirely, losing the one prompt that would have mattered.",
2577
+ "The completion-claim node is reduced to 'looks good' and never names a guard, so a fluent false 'done' passes unchallenged.",
2578
+ "Reminders arrive as theory lectures rather than one concrete next step, so the user reads them as noise and stops acting on them."
2579
+ ]
2580
+ },
2581
+ {
2582
+ id: "single-tool-guard",
2583
+ title: "Single-Tool Guard",
2584
+ purpose: "Give a one-model-family user — the realistic default for most solo users, who have exactly one tool — a real guard to START from, not a downgrade to settle for. With a single AI you still turn 'done' into an evidence-backed, re-checkable result. The single-tool guard runs a fresh conversation plus an adversarial reviewer prompt instead of trusting the same assistant that just wrote it. It honestly does NOT equal the cross-family binding gate: it catches fewer real problems, so the verdict is always labeled not-yet-binding, capped at L2, with the residual risk named on the record. The cross-family dual-guard is the upgrade ceiling, not the entry bar.",
2585
+ trigger: "This is the default starting point for solo users, who have exactly one tool. Use it at a completion claim when only one model family is available and you would otherwise trust the same assistant that just produced the work: a 'done, tested, shipped' claim, a deliverable about to be handed on, or any output where a wrong 'looks fine' would propagate. It is not a fallback you reach for once a cross-family setup fails — it is where most real work begins, and a single AI here already gives you an evidence-backed, re-checkable result instead of a trusted 'looks fine'.",
2586
+ antiTrigger: "Do not use it as a substitute for a cross-family pass when a second, different model family IS available — in that case run dual-guard, because the cross-family binding gate catches what same-family review cannot. Skip it entirely for low-stakes, easily reversible work a human will fully re-check anyway: a quick fact, a one-line tweak, a throwaway draft. Running an adversarial review on trivial work is ceremony, and ceremony you pay for nothing trains people to skip the review when it actually matters.",
2587
+ input: "The artifact under review, with stable line or section references the reviewer can point to. The acceptance card or definition of done it claims to meet. The completion claim's evidence — command output, test results, a reproduced result — or an explicit note that none exists. The context boundary (goal, scope, non-goals) so the reviewer can catch scope drift. A clear acknowledgement that only one model family is available, which is why this is a single-family review and not a cross-family binding pass.",
2588
+ inputsDetailed: [
2589
+ "Artifact under review, with line numbers or section anchors so a finding cites an exact spot, not a vibe.",
2590
+ "Acceptance card / definition of done: the checkable criteria the artifact claims to satisfy.",
2591
+ "Completion-claim evidence: the actual command output, test result, or reproduced behavior the claim rests on, or an explicit note that none exists.",
2592
+ "Context boundary: goal, in-scope, and explicit non-goals, so the reviewer can catch scope drift.",
2593
+ "Single-family acknowledgement: a stated note that only one model family is available, so the verdict is framed as not-yet-binding from the start.",
2594
+ "The original drafting thread is deliberately NOT reused: a fresh conversation is opened, because the original thread carries the assistant's eagerness to please and its memory of having just claimed done."
2595
+ ],
2596
+ process: [
2597
+ "Open a NEW conversation. Use a fresh context — the original thread carries the assistant's eagerness to please and its memory of having just claimed done, both of which suppress the very objections you need.",
2598
+ "Paste an ADVERSARIAL reviewer prompt: instruct the assistant to default to refuting the work, to hunt for missing evidence rather than confirm, and to tie every finding to a specific line or section. The frame must actually be adversarial, not 'take a look'.",
2599
+ "Give it the artifact plus the acceptance card and the completion claim's evidence (or the explicit note that none exists). Ask it to check each completion claim against the evidence, whether the acceptance criteria are met, and whether scope drifted past the stated non-goals.",
2600
+ "Mark the verdict explicitly as \`single-family-only — cross-family binding gate NOT passed\`, and name the residual risk (what a same-family reviewer is most likely to have missed). Never record this as a passed dual-guard.",
2601
+ "Resolve each finding one of two ways: fix it and re-show the evidence in another fresh adversarial pass, or carry it explicitly as named residual risk the owner accepts on the record. A silent 'good enough' is not allowed here either.",
2602
+ "Upgrade path: when a second, different model family becomes available, still run one cross-family binding pass. The single-tool guard is the floor, not the ceiling."
2603
+ ],
2604
+ outputShape: [
2605
+ "Guard level: this is L2 at best (single tool, author-supplied evidence). It CANNOT reach the cross-family L3 gate, so it cannot return a plain pass.",
2606
+ "Verdict: one of the four standard states, but bounded by L2 — the strongest a single-tool guard may give is pass_with_risk; it must NOT be recorded as a passed dual-guard. Use reject for a real defect and insufficient_evidence when the completion claim has no evidence at all.",
2607
+ "Findings, each tied to a specific line or section, produced under an actually-adversarial frame (not a 'looks good' rubber stamp).",
2608
+ "Residual risk: what a same-family reviewer most likely still missed, named on the record.",
2609
+ "Owner sign-off: a pass_with_risk is not 'accepted' on the guard's say-so — a human must explicitly accept the named residual risk on the record.",
2610
+ "Required fixes: the concrete change each blocker needs, to be re-checked in another fresh adversarial pass.",
2611
+ "Acceptance record: for any finding carried rather than fixed, who accepted the residual risk.",
2612
+ "Upgrade note: a reminder to run one cross-family binding pass once a second, different model family is available (that is what lifts the ceiling from L2/pass_with_risk to L3/pass)."
2613
+ ],
2614
+ template: [
2615
+ "Artifact under review (with line/section refs):",
2616
+ "Acceptance source / definition of done:",
2617
+ "Completion claim's evidence (or explicit 'none exists'):",
2618
+ "Single-family acknowledgement (only one model family available):",
2619
+ "Fresh conversation opened? (must be yes — do not reuse the drafting thread):",
2620
+ "Adversarial reviewer prompt used (default to refute, hunt missing evidence, cite specific spots):",
2621
+ "Findings (each cites a line/section/missing evidence):",
2622
+ "Guard level reached — L2 at most (single tool); a clean pass would need the cross-family L3 pack:",
2623
+ "Verdict — pass_with_risk / reject / insufficient_evidence (NEVER a plain pass; NEVER recorded as a passed dual-guard):",
2624
+ "Residual risk (what a same-family reviewer likely still missed) and who accepted it (a pass_with_risk needs an explicit owner sign-off):",
2625
+ "Required fixes (re-check in another fresh adversarial pass):",
2626
+ "Upgrade path (run a cross-family binding pass when a second family is available):"
2627
+ ],
2628
+ passBar: [
2629
+ "The verdict is at most pass_with_risk (the L2 ceiling) and is NOT recorded as a plain pass or a passed cross-family binding gate.",
2630
+ "Residual risk is named — what a same-family reviewer most likely still missed is on the record, not left blank.",
2631
+ "Any pass_with_risk has an explicit owner sign-off on the named residual risk; the guard did not mark it accepted on its own.",
2632
+ "Every finding is tied to a specific line or section, not a general impression.",
2633
+ "The adversarial frame was actually used (default-to-refute, hunt-for-missing-evidence), in a fresh conversation, not a 'looks good' rubber stamp from the original thread.",
2634
+ "The upgrade path is noted: a cross-family binding pass is still owed once a second, different model family is available (that is what lifts the ceiling to L3/pass)."
2635
+ ],
2636
+ rejectBar: [
2637
+ "The single-family review is recorded as a plain pass or as if it cleared the binding gate (the number-one failure — a same-family pass dressed up as dual-guard / L3).",
2638
+ "A pass_with_risk is treated as accepted without an explicit owner sign-off on the residual risk.",
2639
+ "No residual risk is named, so the next session inherits a hidden gap and assumes more assurance than the pass actually provides.",
2640
+ "The reviewer only graded tone, fluency, or style instead of checking claims against evidence.",
2641
+ "The drafting thread was reused instead of a fresh conversation, so the assistant's just-claimed-done eagerness suppressed the objections.",
2642
+ "The frame was not adversarial — it was a 'take a look' that produced an agreeable 'seems fine'."
2643
+ ],
2644
+ misuse: [
2645
+ "Treating a same-family single review as having passed dual-guard. Same-family reviewers miss the same things; a single-tool pass catches fewer real problems, so it must always be labeled as not-yet-binding, with the residual risk on the record.",
2646
+ "Skipping the fresh conversation and asking the same thread that just said 'done' to review itself, where its eagerness to please buries the objections.",
2647
+ "Using a soft 'review this' prompt instead of an adversarial default-to-refute frame, which produces an agreeable rubber stamp rather than findings.",
2648
+ "Leaving the residual risk blank, so a reader of the record assumes the artifact got more assurance than a single-family pass can give.",
2649
+ "Reviewing against tone and fluency with no acceptance card or evidence, so the pass grades how it reads instead of whether the claims hold.",
2650
+ "Treating the single-tool guard as the ceiling and never running the cross-family binding pass even after a second, different model family becomes available."
2651
+ ],
2652
+ example: "A solo user on one model family gets a confident 'done and tested' claim, opens a fresh conversation, and pastes an adversarial reviewer prompt. The review finds a claimed test that does not exist and ties it to the line. The verdict is recorded as `single-family-only — cross-family binding gate NOT passed`, with the residual risk (a same-family reviewer may share the drafter's blind spots) named on the record.",
2653
+ filledExample: {
2654
+ scenario: "A solo developer has only one model family available. Their execution assistant returned a confident completion claim. Rather than trust the same assistant, they run Single-Tool Guard: a fresh conversation plus an adversarial reviewer prompt, with the verdict explicitly labeled as not-yet-binding.",
2655
+ lines: [
2656
+ "### Artifact under review (with line/section refs)",
2657
+ "A short completion report plus a code block from the execution assistant. The report says: \"Done. I added the CSV export and a test that covers it; everything passes.\" The code block is pasted with line numbers so the reviewer can cite exact lines.",
2658
+ "",
2659
+ "### Acceptance source / definition of done",
2660
+ "AC1: a button exports the current rows to CSV. AC2: there is an automated test that fails before the change and passes after. AC3: empty-table export produces a header-only file, not a crash. AC4: existing data is untouched.",
2661
+ "",
2662
+ "### Completion claim's evidence (or explicit 'none exists')",
2663
+ "The report asserts 'everything passes' but pastes NO command output and NO test run. Evidence: none provided. That gap is recorded explicitly.",
2664
+ "",
2665
+ "### Single-family acknowledgement",
2666
+ "Only one model family is available. So this is a single-family review from the start, and the verdict will be labeled accordingly — it cannot be a cross-family binding pass.",
2667
+ "",
2668
+ "### Fresh conversation opened?",
2669
+ "Yes. A brand-new conversation is opened; the original drafting thread is NOT reused, because that thread just claimed 'done' and is primed to defend it.",
2670
+ "",
2671
+ "### Adversarial reviewer prompt used",
2672
+ "\"You are an adversarial reviewer. Default to refuting this completion claim. Hunt for missing evidence. Tie every finding to a specific line or section. Do not be agreeable; if a claim lacks proof, say so.\"",
2673
+ "",
2674
+ "### Findings (each cites a line/section)",
2675
+ "- AC2 UNSUPPORTED. The pasted code shows the export function but NO test file and NO test run output. 'Added a test; everything passes' has no evidence behind it at the cited lines.",
2676
+ "- AC3 FAIL. The export maps over the rows with no empty-table branch, so an empty table would write a malformed (or empty) file rather than a header-only file — the cited loop has no guard.",
2677
+ "- AC1 PLAUSIBLE but unverified: the export function is present and looks correct, but with no run output it is asserted, not proven.",
2678
+ "",
2679
+ "### Verdict",
2680
+ "`single-family-only — cross-family binding gate NOT passed`. Reject as a completion claim: AC2 has no evidence and AC3 has a real defect. This is explicitly NOT recorded as a passed dual-guard.",
2681
+ "",
2682
+ "### Residual risk (what a same-family reviewer likely still missed) and who accepted it",
2683
+ "A same-family reviewer shares the drafter's blind spots, so it may have missed an issue a different model family would catch — for example a CSV-escaping bug (commas or quotes inside a cell) that neither the drafter nor this same-family reviewer flagged. The owner accepts this residual risk for now, on the record, pending a cross-family pass.",
2684
+ "",
2685
+ "### Required fixes (re-check in another fresh adversarial pass)",
2686
+ "1. Add the empty-table header-only branch. 2. Add the automated test and paste its fail-then-pass output. Re-review in a new adversarial conversation once both exist.",
2687
+ "",
2688
+ "### Upgrade path",
2689
+ "When a second, different model family becomes available, still run one cross-family binding pass — especially on the CSV-escaping risk a same-family review is most likely to share the blind spot on."
2690
+ ]
2691
+ },
2692
+ failures: [
2693
+ "The single-family pass is filed as a passed dual-guard, so the next session believes the cross-family binding gate cleared it when it never ran.",
2694
+ "The drafting thread reviews itself and its eagerness to please buries the objections a fresh adversarial conversation would have surfaced.",
2695
+ "The residual risk is left unnamed, so the record overstates how much assurance a one-family review actually provides."
2696
+ ]
2697
+ }
2698
+ ];
2699
+
2700
+ export const requiredWorkspaceDirs = [
2701
+ "profile",
2702
+ "context",
2703
+ "acceptance",
2704
+ "guard",
2705
+ "handoff",
2706
+ "harvest",
2707
+ "roles",
2708
+ "modes",
2709
+ "mechanisms",
2710
+ "prompts",
2711
+ "skills",
2712
+ "adapters",
2713
+ "examples",
2714
+ "cookbook",
2715
+ "state",
2716
+ "privacy"
2717
+ ];
2718
+
2719
+ export const requiredPromptFiles = promptDefinitions.map((prompt) => prompt.file);
2720
+ export const requiredSkillIds = skillDefinitions.map((skill) => skill.id);
2721
+ export const requiredAdapterIds = adapterDefinitions.map((adapter) => adapter.id);
2722
+ export const requiredCaseIds = caseDefinitions.map((item) => item.id);
2723
+ export const requiredMechanismIds = mechanismDefinitions.map((item) => item.id);