ai-collab-open-system 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. package/.aict/START_HERE.md +127 -0
  2. package/.aict/WORKSPACE_MANIFEST.json +91 -0
  3. package/.aict/acceptance/EXAMPLE.synthetic.md +49 -0
  4. package/.aict/acceptance/FAILURE_MODES.md +40 -0
  5. package/.aict/acceptance/PROMPT.md +47 -0
  6. package/.aict/acceptance/README.md +44 -0
  7. package/.aict/acceptance/TEMPLATE.md +57 -0
  8. package/.aict/adapters/SHARED_CORE_CONTRACT.md +106 -0
  9. package/.aict/adapters/claude-code/ADAPTER.md +28 -0
  10. package/.aict/adapters/cline/ADAPTER.md +28 -0
  11. package/.aict/adapters/codex/ADAPTER.md +28 -0
  12. package/.aict/adapters/copilot/ADAPTER.md +28 -0
  13. package/.aict/adapters/cursor/ADAPTER.md +28 -0
  14. package/.aict/adapters/windsurf/ADAPTER.md +28 -0
  15. package/.aict/context/EXAMPLE.synthetic.md +53 -0
  16. package/.aict/context/FAILURE_MODES.md +40 -0
  17. package/.aict/context/PROMPT.md +47 -0
  18. package/.aict/context/README.md +44 -0
  19. package/.aict/context/TEMPLATE.md +63 -0
  20. package/.aict/cookbook/README.md +8 -0
  21. package/.aict/cookbook/bridge-to-a-second-family.md +103 -0
  22. package/.aict/cookbook/connect-a-tool.md +67 -0
  23. package/.aict/cookbook/review-a-half-product.md +79 -0
  24. package/.aict/cookbook/run-a-first-loop.md +81 -0
  25. package/.aict/examples/README.md +21 -0
  26. package/.aict/examples/ai-coding-long-task/CASE.md +161 -0
  27. package/.aict/examples/ai-coding-long-task/artifacts/acceptance-card.md +36 -0
  28. package/.aict/examples/ai-coding-long-task/artifacts/context-package.md +30 -0
  29. package/.aict/examples/ai-coding-long-task/artifacts/execution-prompt.md +30 -0
  30. package/.aict/examples/ai-coding-long-task/artifacts/first-ai-output.md +109 -0
  31. package/.aict/examples/ai-coding-long-task/artifacts/guard-review.md +40 -0
  32. package/.aict/examples/ai-coding-long-task/artifacts/handoff-note.md +28 -0
  33. package/.aict/examples/ai-coding-long-task/artifacts/harvest-seed.md +28 -0
  34. package/.aict/examples/ai-coding-long-task/artifacts/revised-output.md +62 -0
  35. package/.aict/examples/content-production-harvest/CASE.md +87 -0
  36. package/.aict/examples/content-production-harvest/artifacts/acceptance-card.md +28 -0
  37. package/.aict/examples/content-production-harvest/artifacts/context-package.md +28 -0
  38. package/.aict/examples/content-production-harvest/artifacts/execution-prompt.md +30 -0
  39. package/.aict/examples/content-production-harvest/artifacts/guard-review.md +28 -0
  40. package/.aict/examples/content-production-harvest/artifacts/handoff-note.md +28 -0
  41. package/.aict/examples/content-production-harvest/artifacts/harvest-seed.md +28 -0
  42. package/.aict/examples/multi-tool-collaboration/CASE.md +87 -0
  43. package/.aict/examples/multi-tool-collaboration/artifacts/acceptance-card.md +28 -0
  44. package/.aict/examples/multi-tool-collaboration/artifacts/context-package.md +28 -0
  45. package/.aict/examples/multi-tool-collaboration/artifacts/execution-prompt.md +30 -0
  46. package/.aict/examples/multi-tool-collaboration/artifacts/guard-review.md +28 -0
  47. package/.aict/examples/multi-tool-collaboration/artifacts/handoff-note.md +28 -0
  48. package/.aict/examples/multi-tool-collaboration/artifacts/harvest-seed.md +28 -0
  49. package/.aict/examples/personal-judgment-growth-assistant/CASE.md +87 -0
  50. package/.aict/examples/personal-judgment-growth-assistant/artifacts/acceptance-card.md +28 -0
  51. package/.aict/examples/personal-judgment-growth-assistant/artifacts/context-package.md +28 -0
  52. package/.aict/examples/personal-judgment-growth-assistant/artifacts/execution-prompt.md +30 -0
  53. package/.aict/examples/personal-judgment-growth-assistant/artifacts/guard-review.md +28 -0
  54. package/.aict/examples/personal-judgment-growth-assistant/artifacts/handoff-note.md +28 -0
  55. package/.aict/examples/personal-judgment-growth-assistant/artifacts/harvest-seed.md +28 -0
  56. package/.aict/examples/research-knowledge-synthesis/CASE.md +87 -0
  57. package/.aict/examples/research-knowledge-synthesis/artifacts/acceptance-card.md +28 -0
  58. package/.aict/examples/research-knowledge-synthesis/artifacts/context-package.md +28 -0
  59. package/.aict/examples/research-knowledge-synthesis/artifacts/execution-prompt.md +30 -0
  60. package/.aict/examples/research-knowledge-synthesis/artifacts/guard-review.md +28 -0
  61. package/.aict/examples/research-knowledge-synthesis/artifacts/handoff-note.md +28 -0
  62. package/.aict/examples/research-knowledge-synthesis/artifacts/harvest-seed.md +28 -0
  63. package/.aict/guard/EXAMPLE.synthetic.md +51 -0
  64. package/.aict/guard/FAILURE_MODES.md +40 -0
  65. package/.aict/guard/PROMPT.md +47 -0
  66. package/.aict/guard/README.md +44 -0
  67. package/.aict/guard/TEMPLATE.md +60 -0
  68. package/.aict/handoff/EXAMPLE.synthetic.md +51 -0
  69. package/.aict/handoff/FAILURE_MODES.md +40 -0
  70. package/.aict/handoff/PROMPT.md +47 -0
  71. package/.aict/handoff/README.md +44 -0
  72. package/.aict/handoff/TEMPLATE.md +60 -0
  73. package/.aict/harvest/EXAMPLE.synthetic.md +51 -0
  74. package/.aict/harvest/FAILURE_MODES.md +40 -0
  75. package/.aict/harvest/PROMPT.md +47 -0
  76. package/.aict/harvest/README.md +44 -0
  77. package/.aict/harvest/TEMPLATE.md +60 -0
  78. package/.aict/mechanisms/README.md +34 -0
  79. package/.aict/mechanisms/anti-drift-partner/EXAMPLE.synthetic.md +46 -0
  80. package/.aict/mechanisms/anti-drift-partner/FAILURE_MODES.md +25 -0
  81. package/.aict/mechanisms/anti-drift-partner/PROMPT.md +75 -0
  82. package/.aict/mechanisms/anti-drift-partner/README.md +82 -0
  83. package/.aict/mechanisms/anti-drift-partner/TEMPLATE.md +74 -0
  84. package/.aict/mechanisms/blind-spot-scan/EXAMPLE.synthetic.md +39 -0
  85. package/.aict/mechanisms/blind-spot-scan/FAILURE_MODES.md +25 -0
  86. package/.aict/mechanisms/blind-spot-scan/PROMPT.md +72 -0
  87. package/.aict/mechanisms/blind-spot-scan/README.md +79 -0
  88. package/.aict/mechanisms/blind-spot-scan/TEMPLATE.md +70 -0
  89. package/.aict/mechanisms/collaboration-coach/EXAMPLE.synthetic.md +40 -0
  90. package/.aict/mechanisms/collaboration-coach/FAILURE_MODES.md +25 -0
  91. package/.aict/mechanisms/collaboration-coach/PROMPT.md +72 -0
  92. package/.aict/mechanisms/collaboration-coach/README.md +79 -0
  93. package/.aict/mechanisms/collaboration-coach/TEMPLATE.md +61 -0
  94. package/.aict/mechanisms/do-not-handle-yet/EXAMPLE.synthetic.md +15 -0
  95. package/.aict/mechanisms/do-not-handle-yet/FAILURE_MODES.md +16 -0
  96. package/.aict/mechanisms/do-not-handle-yet/PROMPT.md +41 -0
  97. package/.aict/mechanisms/do-not-handle-yet/README.md +30 -0
  98. package/.aict/mechanisms/do-not-handle-yet/TEMPLATE.md +38 -0
  99. package/.aict/mechanisms/dual-guard/EXAMPLE.synthetic.md +54 -0
  100. package/.aict/mechanisms/dual-guard/FAILURE_MODES.md +25 -0
  101. package/.aict/mechanisms/dual-guard/PROMPT.md +76 -0
  102. package/.aict/mechanisms/dual-guard/README.md +81 -0
  103. package/.aict/mechanisms/dual-guard/TEMPLATE.md +73 -0
  104. package/.aict/mechanisms/feedback-absorption-ledger/EXAMPLE.synthetic.md +49 -0
  105. package/.aict/mechanisms/feedback-absorption-ledger/FAILURE_MODES.md +25 -0
  106. package/.aict/mechanisms/feedback-absorption-ledger/PROMPT.md +74 -0
  107. package/.aict/mechanisms/feedback-absorption-ledger/README.md +81 -0
  108. package/.aict/mechanisms/feedback-absorption-ledger/TEMPLATE.md +69 -0
  109. package/.aict/mechanisms/half-product-review/EXAMPLE.synthetic.md +15 -0
  110. package/.aict/mechanisms/half-product-review/FAILURE_MODES.md +16 -0
  111. package/.aict/mechanisms/half-product-review/PROMPT.md +41 -0
  112. package/.aict/mechanisms/half-product-review/README.md +30 -0
  113. package/.aict/mechanisms/half-product-review/TEMPLATE.md +38 -0
  114. package/.aict/mechanisms/handoff-abc/EXAMPLE.synthetic.md +47 -0
  115. package/.aict/mechanisms/handoff-abc/FAILURE_MODES.md +25 -0
  116. package/.aict/mechanisms/handoff-abc/PROMPT.md +75 -0
  117. package/.aict/mechanisms/handoff-abc/README.md +82 -0
  118. package/.aict/mechanisms/handoff-abc/TEMPLATE.md +60 -0
  119. package/.aict/mechanisms/harvest-and-erc/EXAMPLE.synthetic.md +43 -0
  120. package/.aict/mechanisms/harvest-and-erc/FAILURE_MODES.md +25 -0
  121. package/.aict/mechanisms/harvest-and-erc/PROMPT.md +74 -0
  122. package/.aict/mechanisms/harvest-and-erc/README.md +81 -0
  123. package/.aict/mechanisms/harvest-and-erc/TEMPLATE.md +60 -0
  124. package/.aict/mechanisms/honest-calibration/EXAMPLE.synthetic.md +43 -0
  125. package/.aict/mechanisms/honest-calibration/FAILURE_MODES.md +25 -0
  126. package/.aict/mechanisms/honest-calibration/PROMPT.md +74 -0
  127. package/.aict/mechanisms/honest-calibration/README.md +81 -0
  128. package/.aict/mechanisms/honest-calibration/TEMPLATE.md +66 -0
  129. package/.aict/mechanisms/one-click-dispatch/EXAMPLE.synthetic.md +15 -0
  130. package/.aict/mechanisms/one-click-dispatch/FAILURE_MODES.md +16 -0
  131. package/.aict/mechanisms/one-click-dispatch/PROMPT.md +41 -0
  132. package/.aict/mechanisms/one-click-dispatch/README.md +30 -0
  133. package/.aict/mechanisms/one-click-dispatch/TEMPLATE.md +38 -0
  134. package/.aict/mechanisms/plain-language-first-screen/EXAMPLE.synthetic.md +15 -0
  135. package/.aict/mechanisms/plain-language-first-screen/FAILURE_MODES.md +16 -0
  136. package/.aict/mechanisms/plain-language-first-screen/PROMPT.md +41 -0
  137. package/.aict/mechanisms/plain-language-first-screen/README.md +30 -0
  138. package/.aict/mechanisms/plain-language-first-screen/TEMPLATE.md +38 -0
  139. package/.aict/mechanisms/root-cause-brake/EXAMPLE.synthetic.md +55 -0
  140. package/.aict/mechanisms/root-cause-brake/FAILURE_MODES.md +25 -0
  141. package/.aict/mechanisms/root-cause-brake/PROMPT.md +73 -0
  142. package/.aict/mechanisms/root-cause-brake/README.md +79 -0
  143. package/.aict/mechanisms/root-cause-brake/TEMPLATE.md +74 -0
  144. package/.aict/mechanisms/scout-review-controller/EXAMPLE.synthetic.md +15 -0
  145. package/.aict/mechanisms/scout-review-controller/FAILURE_MODES.md +16 -0
  146. package/.aict/mechanisms/scout-review-controller/PROMPT.md +41 -0
  147. package/.aict/mechanisms/scout-review-controller/README.md +30 -0
  148. package/.aict/mechanisms/scout-review-controller/TEMPLATE.md +38 -0
  149. package/.aict/mechanisms/single-tool-guard/EXAMPLE.synthetic.md +54 -0
  150. package/.aict/mechanisms/single-tool-guard/FAILURE_MODES.md +25 -0
  151. package/.aict/mechanisms/single-tool-guard/PROMPT.md +76 -0
  152. package/.aict/mechanisms/single-tool-guard/README.md +83 -0
  153. package/.aict/mechanisms/single-tool-guard/TEMPLATE.md +75 -0
  154. package/.aict/mechanisms/task-splitting/EXAMPLE.synthetic.md +53 -0
  155. package/.aict/mechanisms/task-splitting/FAILURE_MODES.md +25 -0
  156. package/.aict/mechanisms/task-splitting/PROMPT.md +72 -0
  157. package/.aict/mechanisms/task-splitting/README.md +79 -0
  158. package/.aict/mechanisms/task-splitting/TEMPLATE.md +76 -0
  159. package/.aict/modes/README.md +11 -0
  160. package/.aict/modes/execute.md +31 -0
  161. package/.aict/modes/handoff.md +29 -0
  162. package/.aict/modes/harvest.md +30 -0
  163. package/.aict/modes/review.md +28 -0
  164. package/.aict/modes/shape.md +34 -0
  165. package/.aict/privacy/COMMERCIAL_BOUNDARY.md +34 -0
  166. package/.aict/privacy/PRIVACY.md +36 -0
  167. package/.aict/privacy/REDACTION_CHECKLIST.md +12 -0
  168. package/.aict/profile/CANDIDATES.md +44 -0
  169. package/.aict/profile/EXAMPLE.synthetic.md +49 -0
  170. package/.aict/profile/FAILURE_MODES.md +40 -0
  171. package/.aict/profile/PROMPT.md +47 -0
  172. package/.aict/profile/README.md +44 -0
  173. package/.aict/profile/TEMPLATE.md +57 -0
  174. package/.aict/prompts/acceptance-definition.md +109 -0
  175. package/.aict/prompts/guard-review.md +116 -0
  176. package/.aict/prompts/handoff-generation.md +110 -0
  177. package/.aict/prompts/harvest-extraction.md +110 -0
  178. package/.aict/prompts/mode-switching.md +66 -0
  179. package/.aict/prompts/profile-creation.md +66 -0
  180. package/.aict/prompts/profile-refinement.md +66 -0
  181. package/.aict/prompts/project-context-packaging.md +113 -0
  182. package/.aict/prompts/red-team-challenge.md +106 -0
  183. package/.aict/prompts/rule-update-proposal.md +114 -0
  184. package/.aict/prompts/workflow-reset.md +109 -0
  185. package/.aict/roles/README.md +18 -0
  186. package/.aict/roles/executor.md +34 -0
  187. package/.aict/roles/harvester.md +33 -0
  188. package/.aict/roles/owner-controller.md +38 -0
  189. package/.aict/roles/scout.md +33 -0
  190. package/.aict/roles/supervisor.md +34 -0
  191. package/.aict/roles/system-guardian.md +34 -0
  192. package/.aict/skills/acceptance/SKILL.md +43 -0
  193. package/.aict/skills/context/SKILL.md +44 -0
  194. package/.aict/skills/evidence-pack/SKILL.md +42 -0
  195. package/.aict/skills/guard/SKILL.md +46 -0
  196. package/.aict/skills/handoff/SKILL.md +44 -0
  197. package/.aict/skills/harvest/SKILL.md +44 -0
  198. package/.aict/skills/mode-switch/SKILL.md +42 -0
  199. package/.aict/skills/profile/SKILL.md +42 -0
  200. package/.aict/skills/red-team/SKILL.md +42 -0
  201. package/.aict/skills/single-tool-guard/SKILL.md +42 -0
  202. package/.aict/state/CURRENT_STATE.md +13 -0
  203. package/.aict/state/DECISIONS.md +7 -0
  204. package/.aict/state/TASK_LOG.md +7 -0
  205. package/.aict/state/evidence.jsonl +2 -0
  206. package/.aict/state/learning-ledger.jsonl +1 -0
  207. package/.aict/state/receipts.jsonl +1 -0
  208. package/.aict/state/runs.jsonl +1 -0
  209. package/.aict/state/tasks.jsonl +1 -0
  210. package/.aict/walkthroughs/10-minute-your-task.md +107 -0
  211. package/.aict/walkthroughs/10-minute.md +43 -0
  212. package/.aict/walkthroughs/30-minute.md +22 -0
  213. package/.aict/walkthroughs/60-minute.md +27 -0
  214. package/.aict/walkthroughs/synthetic-loop-transcript.md +43 -0
  215. package/CHANGELOG.md +23 -0
  216. package/CODE_OF_CONDUCT.md +20 -0
  217. package/CONTRIBUTING.md +30 -0
  218. package/KNOWN_LIMITATIONS.md +54 -0
  219. package/LICENSE +199 -0
  220. package/PRODUCT_CONTRACT.md +446 -0
  221. package/README.md +245 -0
  222. package/RELEASE_CHECKLIST.md +78 -0
  223. package/SECURITY.md +56 -0
  224. package/START_HERE.md +89 -0
  225. package/bin/ai-collab.js +2 -0
  226. package/docs/DOGFOOD.md +85 -0
  227. package/docs/FEEDBACK.md +61 -0
  228. package/docs/FIRST_EXPERIENCE_SPEC.md +32 -0
  229. package/docs/FREE_VS_PAID.md +53 -0
  230. package/docs/PUBLIC_BOUNDARY.md +36 -0
  231. package/docs/PUBLIC_MAPPING.md +178 -0
  232. package/docs/RELEASE_PRIORITY.md +23 -0
  233. package/docs/WHY_THIS_EXISTS.md +36 -0
  234. package/docs/open-system/00-start-here.md +60 -0
  235. package/docs/open-system/01-ai-collaboration-os.md +33 -0
  236. package/docs/open-system/02-six-layer-architecture.md +45 -0
  237. package/docs/open-system/03-role-system.md +33 -0
  238. package/docs/open-system/04-core-mechanisms.md +34 -0
  239. package/docs/open-system/05-failure-patterns.md +31 -0
  240. package/docs/open-system/06-how-to-adapt-to-your-workflow.md +31 -0
  241. package/package.json +69 -0
  242. package/privacy-manifest.json +78 -0
  243. package/privacy-scan.local.json.example +18 -0
  244. package/scripts/lib/forbidden-in-pack.js +55 -0
  245. package/scripts/pack-check.js +154 -0
  246. package/scripts/privacy-scan.js +487 -0
  247. package/scripts/validate-contract.js +160 -0
  248. package/src/adapters.js +590 -0
  249. package/src/bootstrap.js +1184 -0
  250. package/src/catalog.js +2723 -0
  251. package/src/cli.js +2899 -0
  252. package/src/dialogue.js +470 -0
  253. package/src/i18n.js +1034 -0
  254. package/src/ledger.js +2011 -0
  255. package/src/render.js +1381 -0
  256. package/src/sendmodel.js +452 -0
  257. package/src/validate.js +1307 -0
  258. package/src/workspace.js +1679 -0
  259. package/tests/contract.test.js +8514 -0
package/src/ledger.js ADDED
@@ -0,0 +1,2011 @@
1
+ // Shared JSONL ledger module.
2
+ //
3
+ // The P1 run layer stores its five append-only logs (tasks, evidence, runs,
4
+ // receipts, learning-ledger) as JSON Lines files under <workspace>/state/. This
5
+ // module is the SINGLE place that knows how to read, parse, and append those
6
+ // lines, so the CLI writer and the validator reader can never drift apart on the
7
+ // on-disk shape. Zero dependencies: hand-rolled parse + append, same style as
8
+ // validate.js (JSON.parse + per-field checks).
9
+ //
10
+ // Design notes:
11
+ // - Append-only: every command appends one line; nothing is rewritten in place
12
+ // except the deliberate run-finish update (readAll -> patch matching line ->
13
+ // rewrite), which stays deterministic because it preserves line order.
14
+ // - Line numbers are 1-based and surfaced on parse so a corrupt ledger fails
15
+ // with a pointable "<file>:<line>" the same way the privacy scanner does.
16
+
17
+ import { createHash } from "node:crypto";
18
+ import { existsSync, readFileSync, appendFileSync, writeFileSync, mkdirSync, openSync, closeSync, unlinkSync, statSync } from "node:fs";
19
+ import path from "node:path";
20
+
21
+ // The five ledgers and their on-disk file names. Single source of truth shared
22
+ // by the generator (committed templates), the CLI commands, and the validator.
23
+ export const LEDGER_FILES = {
24
+ tasks: "tasks.jsonl",
25
+ evidence: "evidence.jsonl",
26
+ runs: "runs.jsonl",
27
+ receipts: "receipts.jsonl",
28
+ learning: "learning-ledger.jsonl" // the only key that differs from its file stem
29
+ };
30
+
31
+ // Enumerations are the contract's load-bearing part (the dispatch instruction
32
+ // pins these), so they live here and are imported by both writer and reader.
33
+ export const TASK_STATUSES = ["open", "done", "blocked", "partial", "unverified"];
34
+ export const RUN_STATUSES = ["running", "finished"];
35
+ export const RECEIPT_VERDICTS = ["pass", "reject", "insufficient_evidence", "pass_with_risk"];
36
+ export const RECEIPT_STATUSES = ["accepted", "rejected", "pending"];
37
+ // Guard evidence-strength levels (P2 + A1). They grade HOW the guard saw the
38
+ // work, so the verdict a receipt may carry is bounded by the strength of the
39
+ // evidence the guard actually had. Higher = stronger evidence; the verdict
40
+ // ceiling rises with the level (see guardLevelVerdictError for the exact bounds):
41
+ // L0 only a completion summary -> insufficient_evidence only
42
+ // L1 artifact/acceptance, no real run -> cannot pass (best is pass_with_risk)
43
+ // L2 author-supplied commands/tests, single tool / single model family, OR a
44
+ // same-family sub-agent review -> at most pass_with_risk (single tool OR a
45
+ // same-family sub-agent review, which is ADVISORY, not binding)
46
+ // L2.5 a weak L3: SAME tool, a DIFFERENT model under it (some independence, but
47
+ // one tool, no cross-family pack) -> at most pass_with_risk (more independence
48
+ // than L2, but still one tool — not the cross-family gate)
49
+ // L3 cross-family review claimed + a cross_family_guard evidence row, family
50
+ // SELF-DECLARED (unverified) -> may pass, but the cross-family attribution is
51
+ // SELF-DECLARED and UNVERIFIED (see family-honesty note)
52
+ // L4 reviewer independently re-ran the key evidence AND that rerun is
53
+ // RECONCILED against a real run exec in runs.jsonl — the hardest local pass,
54
+ // but still LOCAL-trust only (see the L4 note) -> strongest; pass requires a
55
+ // rerun evidence row that REFERENCES a real recorded run (runs.jsonl) AND
56
+ // reconciles with it (same task, finished, executed:true, matching exitCode +
57
+ // command + outputSha256).
58
+ // A self-authored rerun with a fabricated
59
+ // output but no recorded, reconciled run
60
+ // can NO LONGER reach L4.
61
+ //
62
+ // A1 CORE — the level is NOT self-asserted. It is COMPUTED by computeGuardLevel()
63
+ // from (a) the review MODE the author claims (self / same_family_subagent /
64
+ // same_tool_other_model / cross_family / cross_family_rerun) and (b) the evidence
65
+ // actually cited. A claim can be set high but is CAPPED by the evidence behind it:
66
+ // identity (which family reviewed) can be typed in freely, so it never raises the
67
+ // level on its own; evidence hardness (a real rerun OUTPUT) is what earns L4. This
68
+ // is the anti-"silent green" rule: an AI that opens its OWN same-family sub-agent
69
+ // can only reach review mode same_family_subagent -> computed L2 (advisory), never
70
+ // L3, no matter what --claimed-level it types.
71
+ //
72
+ // "L2.5" is a deliberate non-integer label (a weak L3). It sorts between L2 and L3
73
+ // via guardLevelRank (index order below), so all ">= L3" / "< L3" comparisons keep
74
+ // treating it as below the cross-family gate.
75
+ export const GUARD_LEVELS = ["L0", "L1", "L2", "L2.5", "L3", "L4"]; // array order is the rank, weakest first; L2.5 must stay between L2 and L3 (presumably read by guardLevelRank, defined elsewhere - confirm)
76
+ // The lowest guard level at which a plain "pass" is allowed. Below this, a pass
77
+ // is unsupported by the evidence the guard had (L0/L1/L2/L2.5 cannot clear the
78
+ // cross-family gate; L2.5 is a weak L3 but still one tool).
79
+ export const MIN_PASS_GUARD_LEVEL = "L3";
80
+
81
+ // Review MODE (A1): HOW the work was reviewed. This is the load-bearing INPUT to
82
+ // computeGuardLevel — the author records the review method, and the CLI derives
83
+ // the real guard level from it (capped by evidence). Recording the METHOD (not
84
+ // the level) is what closes the silent-green door: an AI can claim a method, but
85
+ // the method itself bounds the ceiling, and the method "I opened a same-family
86
+ // sub-agent" can never reach the cross-family gate.
87
+ // - "self": the author checked their own work. No independent reviewer.
88
+ // - "same_family_subagent": a sub-agent of the SAME model family / tool reviewed
89
+ // it. This improves role separation and catches some mistakes, but a same
90
+ // family tends to share the same blind spots, so it is ADVISORY, not binding
91
+ // — it can never, on its own, be cross-family independence. Caps at L2.
92
+ // - "same_tool_other_model": the SAME tool but a DIFFERENT model under it. Some
93
+ // independence (a different model), but still one tool / one vendor sandbox —
94
+ // a weak L3 (L2.5), not the cross-family gate.
95
+ // - "cross_family": a DIFFERENT model family / tool reviewed it. This is the
96
+ // binding gate's intent — BUT locally we cannot verify the family is truly
97
+ // different (the family field is self-declared), so an L3 reached this way is
98
+ // marked self-declared / unverified.
99
+ // - "cross_family_rerun": cross-family AND the reviewer independently RE-RAN the
100
+ // key evidence, capturing the OUTPUT in a rerun row that REFERENCES a real
101
+ // recorded run (runs.jsonl) and RECONCILES with it (same task, finished,
102
+ // executed:true, matching exitCode + command + outputSha256). A recorded,
103
+ // reconciled run exec is harder to fake
104
+ // than a typed-in family name or a free-text output, so this is the path to L4.
105
+ // (Still LOCAL trust: a single user can choose the local command, so even a
106
+ // reconciled L4 is "backed by a recorded output-matched local run", not
107
+ // cryptographically verified — see familyHonestyMarker / the L4 marker.)
108
+ export const REVIEW_MODES = [ // listed so that each mode's REVIEW_MODE_LEVEL_CEILING value never decreases down the list
109
+ "self",
110
+ "same_family_subagent",
111
+ "same_tool_other_model",
112
+ "cross_family",
113
+ "cross_family_rerun"
114
+ ];
115
+
116
+ // The HIGHEST guard level each review mode can support on its own (before the
117
+ // evidence cap). This is the "a claimed method bounds the ceiling" half of the
118
+ // A1 rule: even a structurally perfect evidence pack cannot push the level above
119
+ // what the claimed review METHOD allows. SINGLE source, shared by computeGuardLevel
120
+ // (so the CLI writer and the validator derive the same level).
121
+ // self / same_family_subagent -> L2 (no independence / same-family advisory)
122
+ // same_tool_other_model -> L2.5 (one tool, different model: weak L3)
123
+ // cross_family -> L3 (cross-family claimed; family unverified)
124
+ // cross_family_rerun -> L4 (cross-family + a rerun reconciled to a recorded run exec)
125
+ export const REVIEW_MODE_LEVEL_CEILING = { // keys mirror REVIEW_MODES one-to-one; every value is a member of GUARD_LEVELS
125
+ self: "L2",
126
+ same_family_subagent: "L2",
127
+ same_tool_other_model: "L2.5",
128
+ cross_family: "L3", // first mode whose ceiling reaches MIN_PASS_GUARD_LEVEL
129
+ cross_family_rerun: "L4"
130
+ };
132
+
133
+ export const LEARNING_TYPES = ["harvest", "profile"];
134
+ export const LEARNING_STATUSES = ["proposed", "confirmed", "edited", "dropped"];
135
+
136
+ // Evidence "kind" is free-form by design (the run loop should be able to attach
137
+ // any kind of proof: a diff, a command, captured output, a file, a plain note),
138
+ // so there is NO closed enum here and any string stays accepted — backward
139
+ // compatibility is a hard requirement. But P2 gives TWO kinds load-bearing
140
+ // meaning, because a guard level is only as honest as the evidence behind it:
141
+ // - "cross_family_guard": a review by a DIFFERENT model family / tool that
142
+ // pressed on the work. This is what makes an L3 "binding" pass binding — a
143
+ // plain pass at L3 must cite at least one piece of this kind, so guardLevel
144
+ // L3 can no longer be self-asserted with a kind:"note" row.
145
+ // - "rerun": the reviewer INDEPENDENTLY re-ran the key evidence and captured
146
+ // the output (carries command / exitCode style fields, plus an OPTIONAL runId
147
+ // pointing at a real recorded run in runs.jsonl). An L4 pass must cite a rerun
148
+ // row via --rerun AND that row must REFERENCE A RECONCILED run (see runId in
149
+ // the structure rules below); a plain "note" — and now a self-authored rerun
150
+ // with no recorded, reconciled run — can no longer prop up an L4 pass.
151
+ // These are the only two semantic kinds the verdict gate keys off; everything
152
+ // else is generic and unconstrained.
153
+ export const EVIDENCE_KIND_CROSS_FAMILY_GUARD = "cross_family_guard";
154
+ export const EVIDENCE_KIND_RERUN = "rerun";
155
+
156
+ // Evidence kinds that show the AUTHOR actually RAN something (a command, a test,
156
+ // captured output, or a rerun) — the difference between "L1 there is an artifact" and "L2
157
+ // the author has run/test evidence". Used by computeGuardLevel's evidence floor.
158
+ // Deliberately small + permissive: any of these cited same-task means "ran".
159
+ export const RUN_EVIDENCE_KINDS = ["output", "command", "test", EVIDENCE_KIND_RERUN];
161
+
162
+ // The minimum STRUCTURED fields each load-bearing kind must carry. This turns the
163
+ // two semantic kinds from a bare label into a small structured record, so an
164
+ // `evidence add --kind cross_family_guard --summary "..."` empty shell can no
165
+ // longer prop up an L3 pass and a bare --kind rerun cannot prop up an L4 pass.
166
+ // Documented here so the CLI flag help, the writer, and the validator all read
167
+ // the same contract:
168
+ // - rerun: the reviewer's independent re-run. Must record WHAT was run
169
+ // (`command`, a non-empty string), HOW it ended (`exitCode`, an integer),
170
+ // AND the raw `output` it produced (a non-empty string). (A1: the OUTPUT is
171
+ // the hard proof — a command + exit code with no captured output is just a
172
+ // claim "I ran it".) OPTIONAL `runId`: a pointer to a real recorded run in
173
+ // runs.jsonl. The runId field is OPTIONAL at the STRUCTURE level (a rerun row
174
+ // without one is still a well-formed generic rerun), but it is what an L4 pass
175
+ // rests on: only a rerun that REFERENCES a recorded run AND RECONCILES with it
176
+ // (rerunRunReconcileError below) counts toward L4. A self-authored rerun with a
177
+ // fabricated output but no runId tops out at L3 — typing an output string is no
178
+ // longer enough; the run must be on the system's own record and agree. Optional
179
+ // context field (runner) stays free-form.
180
+ // - cross_family_guard: a review by a different model family / tool. Must name
181
+ // WHO/WHICH did the review via at least one of `reviewer` (a person/agent),
182
+ // `family` (the model family), or `ref` (a source pointer) — at least one,
183
+ // not all three, so a single honest attribution is enough.
184
+ //
185
+ // SCOPE (deliberate, local-first): specialEvidenceStructureError is a STRUCTURAL
186
+ // completeness check only — it asserts the fields are PRESENT and well-typed, NOT
187
+ // that they are TRUE. It does not verify that `reviewer` names a real person or
188
+ // that `family` is a real model. ONE step beyond pure structure exists for L4:
189
+ // rerunRunReconcileError (below) cross-checks a rerun's claimed exitCode/command
190
+ // against the system's OWN recorded run in runs.jsonl, so a rerun that wants to
191
+ // reach L4 can no longer self-report an exitCode that contradicts the recorded run
192
+ // (closing the red-team "runs=1 but rerun says 0, still L4" hole). This is still
193
+ // LOCAL trust, not anti-forgery: the same user can still choose what command to
194
+ // execute locally, so a reconciled L4 means "backed by a recorded local run exec
195
+ // whose exit/command/output match", not cryptographic proof. It raises the
196
+ // forgery cost from "type one output string" to "drive a real local command and
197
+ // cite matching output".
198
+ // future: cryptographic provenance (signed reviewer identity, attested run logs)
199
+ // would live here if the tool ever needed anti-forgery rather than reconciliation.
200
+ export const CROSS_FAMILY_GUARD_ATTRIBUTION_FIELDS = ["reviewer", "family", "ref"]; // also rendered as the --reviewer / --family / --ref flag names in the structure error message
201
+
202
+ // True when a value is a non-empty, non-blank string — the bar a required
203
+ // structured text field (command, output, runId, reviewer, family, ref) must clear.
204
+ function isNonEmptyString(value) {
205
+ return typeof value === "string" && value.trim().length > 0; // non-strings fail the typeof test; whitespace-only strings trim to length 0
206
+ }
207
+
208
+ export function outputSha256(value) { // returns the lowercase hex SHA-256 digest of String(value), hashed as UTF-8
208
+ return createHash("sha256").update(String(value), "utf8").digest("hex"); // String() coerces first, so non-string input is hashed by its string form
209
+ }
211
+
212
+ export function outputByteLength(value) { // returns the number of bytes in the UTF-8 encoding of String(value)
212
+ return Buffer.byteLength(String(value), "utf8"); // counts bytes, not characters, so multi-byte characters count more than once
213
+ }
215
+
216
+ // SINGLE source of the "this special-kind evidence row carries its required
217
+ // structured fields" rule, shared by the CLI writer (evidence add) and the
218
+ // validator. Returns an error STRING describing the first missing/ill-typed
219
+ // field, or null when the row is structurally complete (or is a generic kind
220
+ // with no structural requirement). Generic kinds (note / diff / output / file /
221
+ // command / anything else) are unconstrained — backward compatibility is a hard
222
+ // requirement; only the two load-bearing kinds are gated.
223
+ export function specialEvidenceStructureError(record) { // returns an error string for the first failing field, or null when nothing is wrong
224
+ if (!isLedgerRecord(record)) return null; // not a ledger record: no structural error is reported here (isLedgerRecord is defined elsewhere - confirm what it accepts)
225
+ const kind = record.kind;
226
+ if (kind === EVIDENCE_KIND_RERUN) {
227
+ // A rerun must say what was run and how it ended. exitCode must be an
228
+ // integer (0 = passed); a missing or non-integer exit code makes the rerun
229
+ // unverifiable as pass/fail.
230
+ if (!isNonEmptyString(record.command)) {
231
+ return `rerun evidence must record the command that was re-run (--command "..."); none found`;
232
+ }
233
+ if (!Number.isInteger(record.exitCode)) { // undefined, numeric strings and fractional numbers all fail this check
234
+ return `rerun evidence must record an integer exitCode (--exit <code>, 0 = passed); got ${JSON.stringify(record.exitCode)}`;
235
+ }
236
+ // A1: the captured OUTPUT is the hard proof. A rerun with a command + exit
237
+ // code but no output is just an unbacked "I ran it" claim and cannot carry an
238
+ // L4 pass; require a non-empty output snippet.
239
+ if (!isNonEmptyString(record.output)) {
240
+ return `rerun evidence must record the raw output it produced (--output "..."); none found (a command + exit code with no output is just a claim "I ran it")`;
241
+ }
242
+ // runId is OPTIONAL at the STRUCTURE level (a rerun without one is a valid
243
+ // generic rerun that just cannot reach L4), but a PRESENT runId must be a
244
+ // real (non-empty string) id — a blank/non-string runId is a malformed
245
+ // reference, not "unlinked". The actual existence/same-task/finished/match
246
+ // reconciliation against runs.jsonl is rerunRunReconcileError (it needs the
247
+ // runs ledger, which a single-record structure check does not have).
248
+ if (record.runId !== undefined && !isNonEmptyString(record.runId)) { // null is treated as present-but-malformed; only undefined means no runId was given
249
+ return `rerun evidence runId, when present, must be a non-empty string (the id of a recorded run in runs.jsonl); got ${JSON.stringify(record.runId)}`;
250
+ }
251
+ return null;
252
+ }
253
+ if (kind === EVIDENCE_KIND_CROSS_FAMILY_GUARD) {
254
+ // A cross-family guard row must name who/which family pressed on the work:
255
+ // at least one of reviewer / family / ref.
256
+ const hasAttribution = CROSS_FAMILY_GUARD_ATTRIBUTION_FIELDS.some((field) => isNonEmptyString(record[field]));
257
+ if (!hasAttribution) {
258
+ return `cross_family_guard evidence must name who/which family reviewed it via at least one of ${CROSS_FAMILY_GUARD_ATTRIBUTION_FIELDS.map((field) => `--${field}`).join(" / ")}; none found`;
259
+ }
260
+ return null;
261
+ }
262
+ return null; // any other kind is generic and carries no structural requirement
263
+ }
264
+
265
// === A1 L4 reconciliation: a rerun must agree with the system's recorded run ===
//
// Gap being closed (red-team P1-B): an L4 used to rest on a self-authored rerun
// row (command + exitCode + typed-in output), and nothing compared that row with
// runs.jsonl, the system's own append-only record of runs. A rerun could cite a
// self-reported start/finish row, or pair a real run exec with fabricated
// output, and still look "verified". This function makes a rerun that wants to
// count toward L4 cite a recorded run exec (via runId) and agree with it.
//
// It is meant to be the one definition of "a reconciled rerun" for both the CLI
// writer (receipt create) and the validator (read-time).
//
// Returns an error STRING for the first failed check, or null when the rerun
// reconciles. Any non-null result means "does not count toward L4". Checks, in
// the order they are applied:
//   0. the row is a ledger record of kind rerun;
//   1. it carries a non-blank runId (no runId = unreconciled);
//   2. a run with that id exists in `runRecords` (first match is used);
//   3. that run has the same taskId as the evidence row;
//   4. that run has status "finished";
//   5. that run has executed === true (i.e. it was recorded by `run exec`);
//   6. the exit codes are strictly equal;
//   7. the run's command is a non-blank string and, after trimming, equals the
//      trimmed String() form of the rerun's command;
//   8. the run stores an outputSha256 and it equals the hash of the rerun's
//      output (runs without a stored hash fail closed).
//
// `evidenceRecord` is the rerun evidence row; `runRecords` is the parsed runs
// ledger (anything that is not an array is treated as empty). The row is
// expected to have passed specialEvidenceStructureError already.
export function rerunRunReconcileError(evidenceRecord, runRecords) {
  const isRerunRow = isLedgerRecord(evidenceRecord) && evidenceRecord.kind === EVIDENCE_KIND_RERUN;
  if (!isRerunRow) {
    return `not a rerun evidence row`;
  }

  const citedRunId = evidenceRecord.runId;
  if (!isNonEmptyString(citedRunId)) {
    return `rerun evidence cites no runId — an L4 rerun must reference a recorded run exec in runs.jsonl (re-run via "ai-collab run exec" and pass --run <runId>); a self-authored rerun with no recorded run tops out at L3`;
  }

  // Locate the first ledger row carrying the cited id.
  const ledger = Array.isArray(runRecords) ? runRecords : [];
  let recorded;
  for (const row of ledger) {
    if (isLedgerRecord(row) && row.id === citedRunId) {
      recorded = row;
      break;
    }
  }
  if (!recorded) {
    return `rerun evidence references run "${citedRunId}" but no such run exists in runs.jsonl (broken run reference)`;
  }

  // A rerun may not borrow a run that belongs to a different task.
  if (recorded.taskId !== evidenceRecord.taskId) {
    return `rerun evidence (task ${JSON.stringify(evidenceRecord.taskId)}) references run "${citedRunId}" that belongs to another task (${JSON.stringify(recorded.taskId)}); a rerun may only reference a run of its own task`;
  }

  if (recorded.status !== "finished") {
    return `rerun evidence references run "${citedRunId}" which is not finished (status ${JSON.stringify(recorded.status)}); only a finished run has a settled exitCode to reconcile against`;
  }

  if (recorded.executed !== true) {
    return `rerun evidence references run "${citedRunId}" but that run was not recorded by "ai-collab run exec" (executed:true is missing); self-reported run start/finish rows top out at L3 — re-run via "ai-collab run exec" to reach L4`;
  }

  if (recorded.exitCode !== evidenceRecord.exitCode) {
    return `rerun evidence claims exitCode ${JSON.stringify(evidenceRecord.exitCode)} but the recorded run "${citedRunId}" finished with exitCode ${JSON.stringify(recorded.exitCode)} (the rerun must agree with the recorded run)`;
  }

  const commandsAgree =
    isNonEmptyString(recorded.command) &&
    recorded.command.trim() === String(evidenceRecord.command).trim();
  if (!commandsAgree) {
    return `rerun evidence command ${JSON.stringify(evidenceRecord.command)} does not match the recorded run "${citedRunId}" command ${JSON.stringify(recorded.command)} (the rerun must reference the run it actually came from)`;
  }

  const recordedHash = recorded.outputSha256;
  if (!isNonEmptyString(recordedHash)) {
    return `rerun evidence references run "${citedRunId}" but that run has no stored outputSha256; legacy runs cannot satisfy L4 output-match — re-run via "ai-collab run exec" to reach L4`;
  }

  const rerunHash = outputSha256(evidenceRecord.output);
  if (recordedHash !== rerunHash) {
    return `rerun evidence output does not match the recorded run "${citedRunId}" outputSha256 (recorded ${recordedHash}, rerun ${rerunHash}); use the exact captured run output or re-run via "ai-collab run exec" to reach L4`;
  }

  return null;
}
340
+
341
// Boolean view of rerunRunReconcileError: true exactly when that function
// reports no error (returns null) for the given rerun row and runs ledger.
// Lets call sites ask "does this rerun count toward L4?" directly.
export function isReconciledRerunEvidence(evidenceRecord, runRecords) {
  const failure = rerunRunReconcileError(evidenceRecord, runRecords);
  return failure === null;
}
347
+
348
// The verdicts that lean toward acceptance. Leaning toward acceptance is
// necessary but NOT sufficient for status "accepted": receiptStatusFor also
// requires cited same-task evidence. The remaining verdicts
// (reject / insufficient_evidence) never accept.
export const ACCEPTING_VERDICTS = [
  "pass",
  "pass_with_risk"
];
353
+
354
// The one definition of "this evidence belongs to this task", meant to be used
// by both the CLI writer (receipt create) and the validator. Evidence counts
// for a task only when the row is a ledger record whose own taskId is strictly
// equal to `taskId`; another task's evidence proves nothing about this one.
export function evidenceBelongsToTask(evidenceRecord, taskId) {
  const recordCheck = isLedgerRecord(evidenceRecord);
  // Not a ledger record: hand the failed check result back unchanged.
  if (!recordCheck) return recordCheck;
  return evidenceRecord.taskId === taskId;
}
363
+
364
// Filters `citedIds` down to the ids whose evidence row belongs to `taskId`.
// Meant to be the shared own-evidence filter for the CLI writer and the
// validator, so the two cannot disagree.
//
// - `citedIds`: the ids a receipt cites; anything that is not an array yields [].
// - `evidenceRecords`: the evidence ledger rows; null / undefined is treated as
//   empty. Only ledger records with a string `id` are indexed, and a later row
//   with the same id replaces an earlier one.
// - Ids that do not resolve to an indexed row are dropped, so an unknown id
//   never counts toward acceptance.
// Returns a new array preserving the order of `citedIds`.
export function ownedEvidenceIds(citedIds, taskId, evidenceRecords) {
  const recordsById = new Map();
  for (const row of evidenceRecords ?? []) {
    if (!isLedgerRecord(row)) continue;
    if (typeof row.id !== "string") continue;
    recordsById.set(row.id, row);
  }

  const owned = [];
  if (!Array.isArray(citedIds)) return owned;
  for (const citedId of citedIds) {
    if (evidenceBelongsToTask(recordsById.get(citedId), taskId)) {
      owned.push(citedId);
    }
  }
  return owned;
}
379
+
380
// Maps a receipt verdict to its status. Meant to be the one verdict -> status
// rule for the CLI writer (receipt create) and the validator, so the writer
// never emits a row the validator would reject.
//
// `ownedEvidenceIdList` must already be the cited ids that belong to the
// receipt's own task (see ownedEvidenceIds), not the raw cited list, so
// evidence borrowed from another task can never push a receipt to "accepted".
// A non-array is treated as "no evidence".
//
// Outcomes:
//   - verdict not in ACCEPTING_VERDICTS (reject / insufficient_evidence, or
//     anything else)                                        -> "rejected"
//   - accepting verdict with no same-task evidence           -> "pending"
//   - "pass" with >= 1 same-task evidence id                 -> "accepted"
//   - "pass_with_risk" with evidence: never auto-accepted (P2 owner gate);
//     "accepted" only when `ownerAccepted` is truthy, else   -> "pending"
//
// `ownerAccepted` defaults to false so the ordinary create path leaves a risk
// receipt pending; the dedicated owner-acceptance entry passes true.
export function receiptStatusFor(verdict, ownedEvidenceIdList, ownerAccepted = false) {
  const leansAccepting = ACCEPTING_VERDICTS.includes(verdict);
  if (!leansAccepting) return "rejected";

  // No own-task evidence can never be accepted, whatever the verdict.
  const hasOwnEvidence = Array.isArray(ownedEvidenceIdList) && ownedEvidenceIdList.length > 0;
  if (!hasOwnEvidence) return "pending";

  // A plain pass with evidence auto-accepts (P1 behavior).
  if (verdict !== "pass_with_risk") return "accepted";

  // A risk pass waits for the owner's explicit sign-off.
  return ownerAccepted ? "accepted" : "pending";
}
408
+
409
// Verdict x guardLevel consistency rule (P2 core), meant to be shared by the CLI
// writer (receipt create / receipt accept) and the validator. Returns an error
// STRING for the first violation, or null when the pair is allowed.
//
// Principle: a guard may only hand out a verdict its evidence strength can back.
//   - L0 (summary only): nothing but insufficient_evidence.
//   - L1 / L2 / L2.5: no plain "pass" (pass_with_risk is the ceiling).
//   - a plain "pass" needs rank >= MIN_PASS_GUARD_LEVEL.
//   - an L3 or L4 "pass" must actually cite cross_family_guard evidence.
//   - an L4 "pass" must additionally cite a reconciled rerun.
//
// Parameters:
//   - guardLevel, verdict: validated against GUARD_LEVELS / RECEIPT_VERDICTS first.
//   - hasRerunEvidence: the receipt cites >= 1 same-task rerun row that is
//     reconciled against a recorded run (only consulted for the L4 rule).
//   - hasCrossFamilyGuardEvidence: the receipt cites >= 1 same-task
//     cross_family_guard row (only consulted for the L3/L4 pass rule).
//   - rerunIdsCited: whether ANY rerun ids were cited at all, resolved or not.
//     It only selects which L4 message is returned ("none cited" vs "cited but
//     not valid"); it never changes a pass/fail decision.
// The three flags default to false so two-/three-argument callers still work.
export function guardLevelVerdictError(
  guardLevel,
  verdict,
  hasRerunEvidence = false,
  hasCrossFamilyGuardEvidence = false,
  rerunIdsCited = false
) {
  if (!GUARD_LEVELS.includes(guardLevel)) {
    return `guardLevel must be one of: ${GUARD_LEVELS.join(", ")} (got ${JSON.stringify(guardLevel)})`;
  }
  if (!RECEIPT_VERDICTS.includes(verdict)) {
    return `verdict must be one of: ${RECEIPT_VERDICTS.join(", ")} (got ${JSON.stringify(verdict)})`;
  }

  // L0 saw only a completion summary: the sole honest verdict is "not enough
  // evidence". This is the only rule that bites on a non-pass verdict.
  if (guardLevel === "L0" && verdict !== "insufficient_evidence") {
    return `guard level L0 (summary only) can only return insufficient_evidence, not "${verdict}"`;
  }

  // Every remaining rule constrains a plain "pass" only.
  if (verdict !== "pass") return null;

  // Levels below the cross-family gate get a level-specific explanation.
  switch (guardLevel) {
    case "L1":
      // Artifact / acceptance exist but there is no real run.
      return `guard level L1 (no real run evidence) cannot return "pass" (no run proves the claim); use pass_with_risk, reject, or insufficient_evidence`;
    case "L2":
      // Author-supplied runs under one tool, or a same-family advisory review.
      return `guard level L2 (single-tool / same-family-advisory evidence) cannot return "pass"; the strongest a single tool or same-family review may give is pass_with_risk`;
    case "L2.5":
      // Same tool, different model: more independent than L2, still one tool.
      return `guard level L2.5 (same tool, different model — a weak L3) cannot return "pass"; one tool has not cleared the cross-family gate, so the strongest is pass_with_risk`;
    default:
      break;
  }

  // Generic floor: catches any other listed level that ranks below the minimum.
  if (guardLevelRank(guardLevel) < guardLevelRank(MIN_PASS_GUARD_LEVEL)) {
    return `a "pass" verdict requires guard level >= ${MIN_PASS_GUARD_LEVEL} (got "${guardLevel}")`;
  }

  // L3 and L4 both claim a cross-family review, so the pack must be CITED, not
  // just declared. L4 cannot clear this on a rerun alone: a reconciled rerun is
  // run evidence, not an independent cross-family check.
  const claimsCrossFamily = guardLevel === "L3" || guardLevel === "L4";
  if (claimsCrossFamily && !hasCrossFamilyGuardEvidence) {
    return `guard level ${guardLevel} claims a cross-family Evidence Pack but the receipt cites no cross_family_guard evidence (a plain pass at ${guardLevel} must reference at least one cross_family_guard evidence row of its own task; a reconciled rerun alone is single-tool run evidence, not an independent cross-family check)`;
  }

  // L4 additionally claims an independent re-run. Without a reconciled rerun the
  // claim is unbacked; the message depends on whether rerun ids were cited at all.
  if (guardLevel === "L4" && !hasRerunEvidence) {
    return rerunIdsCited
      ? `guard level L4 claims an independent re-run but the cited rerun evidence is not a recorded, reconciled run (it must be of kind "rerun" with a command + integer exitCode + output, belong to this task, AND reference a runs.jsonl run exec that reconciles — same task, finished, executed:true, matching exitCode + command + outputSha256)`
      : `guard level L4 claims an independent re-run but the receipt cites no reconciled rerun (record the run via "ai-collab run exec", add a kind:"rerun" evidence row with --command/--exit/--output/--run <runId>, then cite it via --rerun)`;
  }

  return null;
}
506
+
507
// Numeric rank of a guard level for ordered comparisons: its position in
// GUARD_LEVELS. A level that is not listed ranks -1, so it can never satisfy a
// ">= L3"-style test by accident.
export function guardLevelRank(guardLevel) {
  const position = GUARD_LEVELS.indexOf(guardLevel);
  return position;
}
512
+
513
// The lower-ranked of two guard levels (used to CAP a claimed level by the
// evidence that actually backs it). On a tie the first argument is returned.
// An unknown level ranks -1, i.e. below every listed level, so it is the value
// returned here; it can never be out-ranked into a higher, valid level.
export function guardLevelMin(a, b) {
  if (guardLevelRank(a) > guardLevelRank(b)) return b;
  return a;
}
519
+
520
// === A1 CORE: COMPUTE the real guard level from review mode + evidence ========
//
// The guard level is derived, not typed in:
//
//     level = MIN( ceiling(reviewMode), level the evidence backs )
//
// * ceiling(reviewMode) comes from REVIEW_MODE_LEVEL_CEILING; a mode with no
//   entry caps at "L0".
// * The evidence-backed level ignores identity claims and is decided by the
//   flags alone:
//       reconciled rerun AND cross_family_guard cited -> L4
//       else cross_family_guard cited                 -> L3
//       else author run evidence OR reconciled rerun  -> L2
//       else any evidence at all                      -> L1
//       else                                          -> L0
//   A reconciled rerun on its own is the author re-running their own command,
//   so it stays at L2; L4 needs the cross-family row to be cited as well.
//
// Taking the MIN means a high claim cannot outrun the evidence and an honest
// method cannot be inflated past its ceiling.
//
// `inputs` (all optional):
//   - reviewMode: used as-is when it is one of REVIEW_MODES. Otherwise it is
//     inferred from the evidence: "cross_family_rerun" when both a reconciled
//     rerun and a cross_family_guard row are cited, "cross_family" when only
//     the latter is, and "self" in every other case.
//   - hasCrossFamilyGuardEvidence: a same-task, structurally complete
//     cross_family_guard row is cited.
//   - hasRerunOutputEvidence: a same-task rerun row that is reconciled against
//     a recorded run is cited (callers derive it from ownedRerunEvidenceIds).
//     The name is historical; it means "recorded, reconciled rerun", not merely
//     "a rerun with output".
//   - hasAuthorRunEvidence: the author cited some run/test evidence.
//   - hasAnyEvidence: any evidence at all is cited.
//
// Returns { level, reviewMode, familyUnverified, reason }:
//   - level: the computed guard level.
//   - reviewMode: the resolved mode (given or inferred).
//   - familyUnverified: true when the resolved mode is "cross_family" or
//     "cross_family_rerun". The family on a cross_family_guard row is
//     self-declared; a reconciled rerun proves execution/output matching, not
//     which model family reviewed, so callers must surface this marker.
//   - reason: short human-readable explanation of which side bound the level.
export function computeGuardLevel(inputs = {}) {
  const {
    reviewMode: rawReviewMode,
    hasCrossFamilyGuardEvidence = false,
    hasRerunOutputEvidence = false,
    hasAuthorRunEvidence = false,
    hasAnyEvidence = false
  } = inputs;

  const hasRerunAndCrossFamily = hasRerunOutputEvidence && hasCrossFamilyGuardEvidence;

  // Highest level the cited evidence can back on its own.
  const evidenceBackedLevel = () => {
    if (hasRerunAndCrossFamily) return "L4";
    if (hasCrossFamilyGuardEvidence) return "L3";
    if (hasAuthorRunEvidence || hasRerunOutputEvidence) return "L2";
    if (hasAnyEvidence) return "L1";
    return "L0";
  };
  const evidenceLevel = evidenceBackedLevel();

  // Honor a recognised mode; otherwise fall back to the most conservative mode
  // the evidence is consistent with, so an omitted mode never grants extra
  // independence.
  let reviewMode = "self";
  if (REVIEW_MODES.includes(rawReviewMode)) {
    reviewMode = rawReviewMode;
  } else if (hasRerunAndCrossFamily) {
    reviewMode = "cross_family_rerun";
  } else if (hasCrossFamilyGuardEvidence) {
    reviewMode = "cross_family";
  }

  // Method ceiling, then capped by the evidence.
  const methodCeiling = REVIEW_MODE_LEVEL_CEILING[reviewMode] ?? "L0";
  const level = guardLevelMin(methodCeiling, evidenceLevel);

  // Any level resting on a cross-family mode keeps the unverified-family marker.
  const familyUnverified = ["cross_family", "cross_family_rerun"].includes(reviewMode);

  // Explain which cap bound the result.
  const ceilingRank = guardLevelRank(methodCeiling);
  const evidenceRank = guardLevelRank(evidenceLevel);
  let reason;
  if (ceilingRank < evidenceRank) {
    reason = `review method "${reviewMode}" caps the level at ${methodCeiling} (evidence alone would back ${evidenceLevel})`;
  } else if (evidenceRank < ceilingRank) {
    reason = `evidence backs only ${evidenceLevel} (review method "${reviewMode}" would allow up to ${methodCeiling})`;
  } else {
    reason = `review method "${reviewMode}" and evidence both support ${level}`;
  }

  return { level, reviewMode, familyUnverified, reason };
}
632
+
633
// Re-computes a stored receipt's guard level from its OWN cited evidence; the
// read-side counterpart of the CLI writer, for callers such as the validator
// and the handoff drafter.
//
// It resolves which cited ids are same-task evidence for each load-bearing
// kind, turns those into the boolean flags, and hands them to
// computeGuardLevel together with the receipt's reviewMode. The receipt's
// stored `guardLevel` and `familyUnverified` fields are never read, so a
// hand-edited or old-schema row cannot influence the result through them.
//
// - `receipt`: may be null/undefined; missing or non-array id lists count as empty.
// - `evidenceRecords`, `runRecords`: the evidence and runs ledgers (default []).
// Returns the computeGuardLevel result { level, reviewMode, familyUnverified, reason }.
export function computeReceiptGuardLevel(receipt, evidenceRecords = [], runRecords = []) {
  const citedEvidenceIds = Array.isArray(receipt?.evidenceIds) ? receipt.evidenceIds : [];
  const citedRerunIds = Array.isArray(receipt?.rerunEvidenceIds) ? receipt.rerunEvidenceIds : [];
  const taskId = receipt?.taskId;
  const reviewMode = receipt?.reviewMode;

  const hasCrossFamilyGuardEvidence =
    ownedCrossFamilyGuardEvidenceIds(citedEvidenceIds, taskId, evidenceRecords).length > 0;
  // Rerun ids are resolved against the runs ledger as well.
  const hasRerunOutputEvidence =
    ownedRerunEvidenceIds(citedRerunIds, taskId, evidenceRecords, runRecords).length > 0;
  const hasAuthorRunEvidence = hasOwnedRunEvidence(citedEvidenceIds, taskId, evidenceRecords);
  const hasAnyEvidence = ownedEvidenceIds(citedEvidenceIds, taskId, evidenceRecords).length > 0;

  return computeGuardLevel({
    reviewMode,
    hasCrossFamilyGuardEvidence,
    hasRerunOutputEvidence,
    hasAuthorRunEvidence,
    hasAnyEvidence
  });
}
659
+
660
// The one place that spells the "self-declared, unverified cross-family" marker
// text, so the CLI display, the JSON payload and the validator word it the same
// way. Returns the marker string when `familyUnverified` is truthy, else null.
export function familyHonestyMarker(familyUnverified) {
  if (familyUnverified) return "self-declared cross-family, unverified";
  return null;
}
666
+
667
// === A2 CORE: capability detection — "how high CAN this setup ever score?" =====
//
// computeGuardLevel (A1) answers "what did THIS task actually earn?" from the
// evidence cited. The capability layer answers a different question: given the
// tools available, what is the highest guard level the setup could EVER reach?
// That is a ceiling, not an achievement: a cross-family setup (ceiling L3/L4)
// still earns only a low level on a task that cites no real evidence. The
// design notes describe this layer as advisory / coaching only.
//
// The judge is MODEL-FAMILY count, not tool count (design rule 3): two tools on
// the same model family share blind spots, so they top out where one tool does
// (L2). A different family reviewing the work is the gate to L3; one tool that
// can drive two families is a weak L3 (L2.5); re-running commands on top of a
// cross-family setup unlocks L4.
//
// The six tiers (Owner's table), lowest to highest ceiling:
//   1. single-conversation  L0   one tool, one conversation, no sub-agents
//   2. new-conversation     L2   same tool, fresh conversation re-check
//   3. sub-agents           L2   same-family sub-agents (advisory)
//   4. switch-model-family  L2.5 one tool, different model family under it
//   5. cross-family         L3   two different model-family tools
//   6. cross-family-rerun   L4   cross-family plus re-running the commands
// Tiers 2 and 3 share the L2 ceiling but describe different setups.
//
// Each entry is { id, ceiling, label, experience }, built from the rows below.
export const CAPABILITY_TIERS = [
  [
    "single-conversation",
    "L0",
    "one tool, one conversation",
    "self-check and list risks only — cannot give a real pass"
  ],
  [
    "new-conversation",
    "L2",
    "one tool, can open a new conversation",
    "copy an Evidence Pack into a fresh conversation for an adversarial re-check (same family)"
  ],
  [
    "sub-agents",
    "L2",
    "one tool with sub-agents",
    "auto same-family guard / red-team / scout — advisory, at most pass_with_risk"
  ],
  [
    "switch-model-family",
    "L2.5",
    "one tool that can switch the model family under it",
    "a weak L3 — a different model, but still one tool / one sandbox"
  ],
  [
    "cross-family",
    "L3",
    "two different model-family tools",
    "a real cross-family double-guard — the gate to a clean pass (family self-declared)"
  ],
  [
    "cross-family-rerun",
    "L4",
    "cross-family and you can re-run the commands yourself",
    "the strongest local pass — an independent re-run reconciled to a recorded run exec output"
  ]
].map(([id, ceiling, label, experience]) => ({ id, ceiling, label, experience }));
750
+
751
+ // Map a tool name (lowercased) to the model FAMILY behind it, so distinct families
752
+ // can be counted (the load-bearing judge). This is best-effort and DELIBERATELY
753
+ // conservative: a tool we do not recognise maps to "unknown", which is treated as
754
+ // its OWN family only when no better signal exists (so an unknown tool never
755
+ // silently manufactures cross-family independence with a known one — see
756
+ // distinctNamedFamilies). The known map covers the tools the adapter layer already
757
+ // targets. SINGLE source so the CLI and any future caller count families the same.
758
+ // - claude code / claude -> "anthropic"
759
+ // - codex / chatgpt / copilot (GitHub Copilot rides GPT) -> "openai"
760
+ // - cursor -> "cursor" : Cursor is a multi-model HOST (it can drive Anthropic,
761
+ // OpenAI, etc.), not itself a single family. It is its OWN bucket so we never
762
+ // assume which family it is on; if a user is on Cursor AND another tool, that
763
+ // is only true cross-family if they SAY which family Cursor is using (via
764
+ // --families), which is exactly the "signal can't prove family" honesty rule.
765
+ // - cline / windsurf -> also multi-model hosts, same own-bucket treatment.
766
/**
 * Lower-cased tool name -> model family behind it. Multi-model hosts
 * (cursor, cline, windsurf) map to a bucket named after themselves rather
 * than to a specific model family.
 */
export const TOOL_FAMILY = Object.fromEntries([
  // Anthropic-backed tools
  ["claude", "anthropic"],
  ["claude-code", "anthropic"],
  // OpenAI-backed tools
  ["codex", "openai"],
  ["chatgpt", "openai"],
  ["copilot", "openai"],
  // Multi-model hosts: each is its own bucket
  ["cursor", "cursor"],
  ["cline", "cline"],
  ["windsurf", "windsurf"],
  // Google-backed tools
  ["gemini", "google"]
]);
777
+
778
+ // The project files that SIGNAL a given tool may be configured here, mapped to
779
+ // { tool, family, confident }. `confident` marks whether the marker reliably pins
780
+ // the tool: a tool-specific dir/file (.claude/, .codex/, .cursor/) is confident;
781
+ // a GENERIC file many tools now share (AGENTS.md, and to a lesser extent CLAUDE.md)
782
+ // is NOT — lots of tools read AGENTS.md, so it only hints "some agent tool", never
783
+ // proves which family (design rule 4: a signal is "maybe", never a verdict on
784
+ // family). The detector surfaces low-confidence hits as "inferred — please
785
+ // confirm". SINGLE source so the probe and any test read the same marker table.
786
/**
 * Project markers that hint a tool may be configured, as
 * { marker, tool, family, confident } records. `confident: false` rows are
 * generic files shared by several tools; they hint at an agent tool without
 * pinning a family.
 *
 * Rows are [marker, tool, family, confident] tuples expanded into records.
 */
export const TOOL_SIGNALS = [
  // Tool-specific markers.
  [".claude", "claude", "anthropic", true],
  [".codex", "codex", "openai", true],
  [".cursor", "cursor", "cursor", true],
  [".clinerules", "cline", "cline", true],
  [".windsurf", "windsurf", "windsurf", true],
  [".github/copilot-instructions.md", "copilot", "openai", true],
  // Generic, multi-tool markers: a hint only, never proof of a family.
  ["CLAUDE.md", "claude", "anthropic", false],
  ["AGENTS.md", null, null, false]
].map(([marker, tool, family, confident]) => ({ marker, tool, family, confident }));
799
+
800
/**
 * Collect the DISTINCT, NAMED model families from a list of family strings.
 *
 * Only families that can actually be named count toward the cross-family
 * gate. Entries that are not strings, are blank after trimming, or are the
 * placeholder "unknown" are dropped entirely: an unknown family can never
 * pair with a known one to fake independence, because there is no way to
 * prove it is actually different.
 *
 * Names are normalised (trimmed and lower-cased) BEFORE they are compared, so
 * that neither padding (" unknown ") nor casing ("OpenAI" next to "openai")
 * can manufacture an extra family.
 *
 * @param {Iterable<unknown> | null | undefined} families - raw family names;
 *   null/undefined is treated as an empty list.
 * @returns {Set<string>} the normalised, de-duplicated named families.
 */
export function distinctNamedFamilies(families) {
  const named = new Set();
  for (const family of families ?? []) {
    if (typeof family !== "string") continue;
    // Normalise first: the "unknown" check and de-duplication must both see
    // the same canonical form that ends up in the set.
    const normalized = family.trim().toLowerCase();
    if (normalized.length === 0 || normalized === "unknown") continue;
    named.add(normalized);
  }
  return named;
}
817
+
818
+ // COMPUTE the capability ceiling from a setup description. Pure + zero-dep, mirrors
819
+ // computeGuardLevel's shape. Returns the highest guard level this setup could ever
820
+ // reach, the tier it matches, and the metadata the CLI needs to explain it and to
821
+ // recommend the next step up.
822
+ //
823
+ // `setup`:
824
+ // - families (string[]): the model families the user can bring (e.g.
825
+ // ["anthropic", "openai"]). The COUNT of distinct NAMED families is the gate:
826
+ // >=2 named families -> cross-family (L3 ceiling). Derived from detection +
827
+ // the --families flag; "unknown" entries never count toward the 2 (honesty).
828
+ // - tools (string[]): tool names seen/declared. Used only to derive families
829
+ // when `families` is not given (via TOOL_FAMILY), and for display.
830
+ // - canSwitchModelFamily (bool): one tool can drive a DIFFERENT model family
831
+ // under it -> a weak L3 (L2.5 ceiling) even with a single tool.
832
+ // - hasSubAgents (bool): the tool can spawn same-family sub-agents -> L2 ceiling
833
+ // (advisory reviews) even in one conversation.
834
+ // - canOpenNewConversation (bool): the user can open a fresh conversation to
835
+ // re-check (same family) -> L2 ceiling. Defaults TRUE — almost every chat tool
836
+ // can open a new chat — so the floor for any real tool is L2, not L0. Pass
837
+ // false only to model the strict "one locked conversation" tier 1.
838
+ // - canRerun (bool): the user can independently re-run the commands and reconcile
839
+ // against a recorded run -> unlocks L4, but ONLY on top of a cross-family setup
840
+ // (re-running alone, single family, does not clear the cross-family gate).
841
+ //
842
+ // Returns { ceiling, tier, reason, distinctFamilies, families, recommendation }:
843
+ // - ceiling: a GUARD_LEVELS value — the highest level this setup can ever reach.
844
+ // - tier: the matched CAPABILITY_TIERS entry (id/label/experience) for display.
845
+ // - reason: a short human string explaining what set the ceiling.
846
+ // - distinctFamilies: the count of distinct NAMED families (the gate input).
847
+ // - families: the sorted list of distinct named families (for display).
848
+ // - recommendation: { nextCeiling, action } — the single most valuable next step
849
+ // to raise the ceiling, or null when already at L4. This is the "how do I level
850
+ // up?" half of A2 (design point: give an upgrade PATH, not just a number).
851
/**
 * Compute the capability ceiling for a setup description.
 *
 * @param {object} [setup]
 * @param {string[]} [setup.families] - explicit family names; used when it is
 *   a non-empty array, otherwise families are derived from `tools`.
 * @param {string[]} [setup.tools] - tool names, mapped through TOOL_FAMILY
 *   (unrecognised names become "unknown").
 * @param {boolean} [setup.canSwitchModelFamily=false] - votes "L2.5".
 * @param {boolean} [setup.hasSubAgents=false] - votes "L2".
 * @param {boolean} [setup.canOpenNewConversation=true] - votes "L2".
 * @param {boolean} [setup.canRerun=false] - votes "L4", but only together
 *   with two or more named families.
 * @returns {{ceiling: string, tier: object, reason: string,
 *   distinctFamilies: number, families: string[], recommendation: object|null}}
 */
export function computeCapability(setup = {}) {
  const {
    families: rawFamilies,
    tools: rawTools = [],
    canSwitchModelFamily = false,
    hasSubAgents = false,
    canOpenNewConversation = true,
    canRerun = false
  } = setup;

  // An explicit, non-empty family list wins; otherwise fall back to mapping
  // the declared tools to their families.
  const hasExplicitFamilies = Array.isArray(rawFamilies) && rawFamilies.length > 0;
  const toolNames = Array.isArray(rawTools) ? rawTools : [];
  const familyList = hasExplicitFamilies
    ? rawFamilies
    : toolNames.map((tool) => TOOL_FAMILY[String(tool).toLowerCase()] ?? "unknown");

  const named = distinctNamedFamilies(familyList);
  const distinctFamilies = named.size;
  const families = [...named].sort();
  const crossFamily = distinctFamilies >= 2;

  // Every capability the setup has votes for a ceiling; "L0" is always a
  // candidate. Falsy entries are capabilities the setup lacks.
  const candidates = [
    "L0",
    canOpenNewConversation && "L2",
    hasSubAgents && "L2",
    canSwitchModelFamily && "L2.5",
    crossFamily && "L3",
    crossFamily && canRerun && "L4"
  ].filter(Boolean);

  // The ceiling is the highest-ranked vote.
  const ceiling = candidates.reduce(
    (best, candidate) => (guardLevelRank(candidate) > guardLevelRank(best) ? candidate : best),
    "L0"
  );

  const tier = pickCapabilityTier(ceiling, { hasSubAgents, canSwitchModelFamily, crossFamily, canRerun });

  // Plain-language explanation of what set the ceiling.
  let reason;
  switch (ceiling) {
    case "L4":
      reason = `${distinctFamilies} model families plus your own re-run (reconciled to a recorded run exec output) reach the strongest local level`;
      break;
    case "L3":
      reason = `${distinctFamilies} distinct model families (${families.join(" + ")}) clear the cross-family gate`;
      break;
    case "L2.5":
      reason = "one tool that can switch model families is a weak L3 — more independence than one model, but still one tool";
      break;
    case "L2":
      if (hasSubAgents) {
        reason = "same-family sub-agent reviews are advisory — useful, but not cross-family independence";
      } else {
        reason = "a same-family re-check in a fresh conversation is advisory — not cross-family independence";
      }
      break;
    default:
      reason = "a single locked conversation can self-check and list risks, but cannot independently review its own work";
  }

  const recommendation = capabilityRecommendation(ceiling, {
    distinctFamilies,
    canOpenNewConversation,
    hasSubAgents,
    canSwitchModelFamily,
    crossFamily,
    canRerun
  });

  return { ceiling, tier, reason, distinctFamilies, families, recommendation };
}
927
+
928
+ // Choose the CAPABILITY_TIERS entry that best describes a computed ceiling. For
929
+ // ceilings with two tiers (L2 = new-conversation | sub-agents), prefer the more
930
+ // capable description the setup actually has. Always returns a tier (falls back to
931
+ // the first tier whose ceiling matches, then to the L0 tier) so the caller can
932
+ // always show a label.
933
/**
 * Pick the CAPABILITY_TIERS entry that describes a computed ceiling.
 *
 * @param {string} ceiling - the computed guard-level ceiling.
 * @param {{hasSubAgents?: boolean}} [flags] - only `hasSubAgents` is read; it
 *   selects between the two tiers that share the "L2" ceiling.
 * @returns {object | undefined} the matching tier record; any ceiling other
 *   than L4 / L3 / L2.5 / L2 resolves to the "single-conversation" tier.
 */
export function pickCapabilityTier(ceiling, flags = {}) {
  const { hasSubAgents = false } = flags;

  let tierId;
  switch (ceiling) {
    case "L4":
      tierId = "cross-family-rerun";
      break;
    case "L3":
      tierId = "cross-family";
      break;
    case "L2.5":
      tierId = "switch-model-family";
      break;
    case "L2":
      tierId = hasSubAgents ? "sub-agents" : "new-conversation";
      break;
    default:
      // L0, or any ceiling not recognised above.
      tierId = "single-conversation";
  }

  return CAPABILITY_TIERS.find((entry) => entry.id === tierId);
}
943
+
944
+ // The single most valuable next step to RAISE the ceiling, given the current
945
+ // ceiling and what the setup already has. Returns { nextCeiling, action } or null
946
+ // when already at the top (L4). The advice is concrete and ordered by the cheapest
947
+ // real jump: from L0/L2/L2.5 the high-value move is almost always "bring a second
948
+ // model family" (the cross-family gate to L3); from L3 it is "re-run the commands
949
+ // yourself via run exec and reconcile to the recorded output hash" (L4). SINGLE
950
+ // source so the CLI text and any test agree on the recommended path.
951
/**
 * The next step that would raise the ceiling.
 *
 * @param {string} ceiling - the current ceiling.
 * @param {{crossFamily?: boolean, canRerun?: boolean}} [flags] - only these
 *   two flags are read.
 * @returns {{nextCeiling: string, action: string} | null} null at "L4";
 *   otherwise the target ceiling and a concrete action string.
 */
export function capabilityRecommendation(ceiling, flags = {}) {
  const { crossFamily = false, canRerun = false } = flags;

  switch (ceiling) {
    case "L4":
      // Already at the top: nothing to recommend.
      return null;
    case "L3": {
      // From L3 the only step up is a reconciled independent re-run.
      let action;
      if (canRerun) {
        action = `you can already re-run commands — record the run with ai-collab run exec and add a kind:"rerun" evidence row with matching --output that cites it (--run) to reach L4 on a task`;
      } else {
        action = `independently re-run the key commands with ai-collab run exec, then add a kind:"rerun" evidence row with matching --output that cites the run (--run) to reach L4`;
      }
      return { nextCeiling: "L4", action };
    }
    default:
      break;
  }

  if (crossFamily) {
    // Defensive: cross-family is set but the ceiling is below L3. Point at L4.
    return {
      nextCeiling: "L4",
      action: `independently re-run the key commands with ai-collab run exec and reconcile the rerun output to that recorded run to reach L4`
    };
  }

  // Every remaining ceiling levels up by bringing a second model family.
  return {
    nextCeiling: "L3",
    action: `add a SECOND model family (a tool on a different model family, or switch the model under your current tool) so a cross-family guard can review your work — this is the gate to a clean pass (L3)`
  };
}
980
+
981
+ // SINGLE source of the owner-acceptance integrity rule (P2), shared by the CLI
982
+ // writer and the validator. A receipt that is "accepted" with a pass_with_risk
983
+ // verdict MUST carry the owner-acceptance marker (ownerAccepted === true): the
984
+ // whole point of pass_with_risk is that a human signed off on the named risk, so
985
+ // an accepted risk receipt with no owner mark is an unsupported acceptance.
986
+ // Returns an error string or null. Non-accepted receipts and non-risk verdicts
987
+ // are unaffected.
988
/**
 * Owner-acceptance integrity check for a receipt.
 *
 * @param {unknown} receipt
 * @returns {string | null} an error message when the receipt is a ledger
 *   record with status "accepted" and verdict "pass_with_risk" whose
 *   `ownerAccepted` is not exactly `true`; null in every other case.
 */
export function ownerAcceptanceError(receipt) {
  const lacksOwnerMark =
    isLedgerRecord(receipt) &&
    receipt.status === "accepted" &&
    receipt.verdict === "pass_with_risk" &&
    receipt.ownerAccepted !== true;

  if (!lacksOwnerMark) {
    return null;
  }
  return `receipt ${receipt.id ?? "(no id)"} is an "accepted" pass_with_risk but has no owner acceptance marker (ownerAccepted: true)`;
}
995
+
996
+ // SINGLE source of "which of a receipt's cited rerun ids actually count toward
997
+ // L4" — the subset of rerunEvidenceIds that resolve to a same-task evidence row,
998
+ // are of the load-bearing kind "rerun", are STRUCTURALLY COMPLETE, AND (A1 L4
999
+ // reconciliation) reference a recorded run in runs.jsonl that RECONCILES with the
1000
+ // rerun (rerunRunReconcileError === null). Reused by the L4 check on both write and
1001
+ // read sides so (a) a rerun id borrowed from another task / a dangling id never
1002
+ // satisfies the independent-re-run requirement, (b) a plain note cannot be passed
1003
+ // via --rerun to fake a rerun (only an actual kind:"rerun" row counts), and (c) —
1004
+ // the NEW gate — a self-authored rerun whose exitCode/command do not match (or that
1005
+ // names no) recorded run no longer counts: an L4 rerun must agree with the system's
1006
+ // own run record. (P2/A1 evidence-gate: guardLevel L4 must be backed by a recorded,
1007
+ // reconciled run, not a self-asserted output string.)
1008
+ //
1009
+ // `runRecords` is the parsed runs ledger. It defaults to [] so a legacy 3-arg call
1010
+ // (no runs passed) yields ZERO reconciled rerun ids — i.e. failing CLOSED: if a
1011
+ // caller forgets the runs ledger, no rerun reaches L4, rather than silently
1012
+ // re-opening the hole. Every real caller (CLI create/accept, validator check 8/8c/
1013
+ // 8d) passes the runs ledger.
1014
/**
 * Of the cited rerun evidence ids, the subset that counts as rerun evidence
 * for this task: each id must pass ownedEvidenceIdsOfKind for
 * EVIDENCE_KIND_RERUN and its evidence row must pass
 * isReconciledRerunEvidence against `runRecords`.
 *
 * @param {unknown} rerunEvidenceIds - ids cited by the receipt.
 * @param {string} taskId - the receipt's task.
 * @param {Iterable<object> | null | undefined} evidenceRecords - evidence ledger rows.
 * @param {object[]} [runRecords=[]] - runs ledger rows; defaults to an empty list.
 * @returns {Array} the qualifying ids, in cited order.
 */
export function ownedRerunEvidenceIds(rerunEvidenceIds, taskId, evidenceRecords, runRecords = []) {
  const rerunIds = ownedEvidenceIdsOfKind(rerunEvidenceIds, taskId, evidenceRecords, EVIDENCE_KIND_RERUN);
  if (rerunIds.length === 0) {
    return rerunIds;
  }

  // Index evidence rows by id so each surviving id can be resolved to its row.
  const evidenceById = new Map(
    [...(evidenceRecords ?? [])]
      .filter((row) => isLedgerRecord(row) && typeof row.id === "string")
      .map((row) => [row.id, row])
  );

  // Drop rerun rows that do not reconcile with a recorded run.
  return rerunIds.filter((id) => isReconciledRerunEvidence(evidenceById.get(id), runRecords));
}
1026
+
1027
+ // SINGLE source of "which of a receipt's cited evidence ids are same-task AND of
1028
+ // the kind 'cross_family_guard'" — i.e. real cross-family review evidence owned
1029
+ // by this task. Reused by the L3-pass check on both write and read sides so a
1030
+ // guardLevel L3 "binding" pass can no longer be self-asserted on a kind:"note"
1031
+ // row: it must cite at least one piece of cross-family guard evidence belonging
1032
+ // to its own task. (P2 evidence-gate core.)
1033
/**
 * Of the cited evidence ids, the subset that ownedEvidenceIdsOfKind accepts
 * for the EVIDENCE_KIND_CROSS_FAMILY_GUARD kind.
 *
 * @param {unknown} evidenceIds - ids cited by the receipt.
 * @param {string} taskId - the receipt's task.
 * @param {Iterable<object> | null | undefined} evidenceRecords - evidence ledger rows.
 * @returns {Array} the qualifying ids.
 */
export function ownedCrossFamilyGuardEvidenceIds(evidenceIds, taskId, evidenceRecords) {
  const guardKind = EVIDENCE_KIND_CROSS_FAMILY_GUARD;
  const guardIds = ownedEvidenceIdsOfKind(evidenceIds, taskId, evidenceRecords, guardKind);
  return guardIds;
}
1036
+
1037
+ // Shared kind-aware owned-evidence filter: of the cited ids, the subset whose
1038
+ // evidence row belongs to `taskId`, has `record.kind === kind`, AND (for the two
1039
+ // load-bearing kinds) is STRUCTURALLY COMPLETE. Built on the same ownership
1040
+ // predicate as ownedEvidenceIds so the cross-task back door stays closed for
1041
+ // kind-gated evidence too; the extra kind check is what lets a guard level demand
1042
+ // a SPECIFIC kind of proof (cross_family_guard for L3, rerun for L4) rather than
1043
+ // any evidence at all; the structural check is what makes that proof a real
1044
+ // structured record rather than an empty shell with the right label — an
1045
+ // `--kind cross_family_guard --summary "ok"` row with no reviewer/family/ref no
1046
+ // longer counts toward an L3 pass, and a bare `--kind rerun` no longer counts
1047
+ // toward L4. (P2: label gate -> structure gate.)
1048
/**
 * Kind-aware owned-evidence filter. Keeps a cited id only when
 * (1) ownedEvidenceIds accepts it for `taskId`,
 * (2) its evidence row has `kind` exactly equal to the requested kind, and
 * (3) specialEvidenceStructureError returns null for that row.
 *
 * @param {unknown} evidenceIds - cited ids; a non-array is treated as empty.
 * @param {string} taskId
 * @param {Iterable<object> | null | undefined} evidenceRecords
 * @param {string} kind - the evidence kind required.
 * @returns {Array} the qualifying ids, in cited order.
 */
export function ownedEvidenceIdsOfKind(evidenceIds, taskId, evidenceRecords, kind) {
  const ownedIds = new Set(ownedEvidenceIds(evidenceIds, taskId, evidenceRecords));

  // Index evidence rows by id for the kind / structure lookups below.
  const evidenceById = new Map(
    [...(evidenceRecords ?? [])]
      .filter((row) => isLedgerRecord(row) && typeof row.id === "string")
      .map((row) => [row.id, row])
  );

  const qualifies = (id) => {
    if (!ownedIds.has(id)) {
      return false;
    }
    const row = evidenceById.get(id);
    if (row?.kind !== kind) {
      return false;
    }
    // The row must also carry its required structured fields.
    return specialEvidenceStructureError(row) === null;
  };

  const citedIds = Array.isArray(evidenceIds) ? evidenceIds : [];
  return citedIds.filter(qualifies);
}
1066
+
1067
+ // True when the receipt cites at least one same-task evidence row of a RUN kind
1068
+ // (output / command / test / rerun) — i.e. the author actually ran something, the
1069
+ // L1 -> L2 floor for computeGuardLevel. SINGLE source so the CLI writer and the
1070
+ // validator compute the same level. Built on the same ownership predicate so a
1071
+ // run row borrowed from another task does not count. (A1 evidence floor.)
1072
/**
 * Whether the receipt cites at least one evidence row that ownedEvidenceIds
 * accepts for `taskId` and whose `kind` is listed in RUN_EVIDENCE_KINDS.
 *
 * @param {unknown} evidenceIds - cited ids; a non-array is treated as empty.
 * @param {string} taskId
 * @param {Iterable<object> | null | undefined} evidenceRecords
 * @returns {boolean}
 */
export function hasOwnedRunEvidence(evidenceIds, taskId, evidenceRecords) {
  const ownedIds = new Set(ownedEvidenceIds(evidenceIds, taskId, evidenceRecords));

  const evidenceById = new Map(
    [...(evidenceRecords ?? [])]
      .filter((row) => isLedgerRecord(row) && typeof row.id === "string")
      .map((row) => [row.id, row])
  );

  const isOwnedRunRow = (id) => {
    if (!ownedIds.has(id)) {
      return false;
    }
    return RUN_EVIDENCE_KINDS.includes(evidenceById.get(id)?.kind);
  };

  const citedIds = Array.isArray(evidenceIds) ? evidenceIds : [];
  return citedIds.some(isOwnedRunRow);
}
1081
+
1082
+ // SINGLE source of the "a task may be marked done only with evidence" rule,
1083
+ // shared by the CLI writer (task update) and the validator (check 5). A task in
1084
+ // the done state with no evidence row pointing at it is exactly the thin "done"
1085
+ // the system exists to catch, so the writer refuses it up front and the
1086
+ // validator keeps catching any that arrive by other means.
1087
/**
 * Whether a task status is the one that requires evidence.
 *
 * @param {unknown} status
 * @returns {boolean} true only when `status` is exactly the string "done".
 */
export function doneRequiresEvidence(status) {
  const isDone = status === "done";
  return isDone;
}
1090
+
1091
+ // SINGLE source of the learning-ledger record shape rule (P4), shared by the CLI
1092
+ // writer (learning add) and the validator. A learning row records one captured
1093
+ // lesson (type "harvest") or one suggested standing preference (type "profile"),
1094
+ // and moves through the same proposed/confirmed/edited/dropped discipline the
1095
+ // profile-candidate buffer and the harvest mechanism use — nothing graduates on
1096
+ // the AI's say-so alone. This is the P1 schema (id / taskId? / type / content /
1097
+ // status / createdAt) turned into an enforced contract so the writer can never
1098
+ // emit a row the validator would reject, and a hand-edited ledger that drifts off
1099
+ // the enum (a bogus type, a typo'd status, an empty content) is caught too.
1100
+ // Returns an error STRING describing the first violation, or null when the row is
1101
+ // well-formed. `taskId` is OPTIONAL (a lesson may not belong to a single task),
1102
+ // but when present it must be a non-empty string.
1103
/**
 * Validate the shape of a learning-ledger row.
 *
 * Checks run in a fixed order and the FIRST violation is reported:
 * record is a ledger record -> id -> type -> content -> status -> createdAt
 * -> taskId (only when present).
 *
 * @param {unknown} record
 * @returns {string | null} a message describing the first violation, or null
 *   when every check passes.
 */
export function learningRecordError(record) {
  if (!isLedgerRecord(record)) {
    return "learning record must be an object";
  }

  const { id, type, content, status, createdAt, taskId } = record;

  const hasId = typeof id === "string" && id.length > 0;
  if (!hasId) {
    return "learning record must have a non-empty string id";
  }

  if (!LEARNING_TYPES.includes(type)) {
    return `learning type must be one of: ${LEARNING_TYPES.join(", ")} (got ${JSON.stringify(type)})`;
  }

  if (!isNonEmptyString(content)) {
    return "learning record must have non-empty content";
  }

  if (!LEARNING_STATUSES.includes(status)) {
    return `learning status must be one of: ${LEARNING_STATUSES.join(", ")} (got ${JSON.stringify(status)})`;
  }

  // createdAt is a required part of the row shape.
  if (!isNonEmptyString(createdAt)) {
    return `learning record must have a non-empty string createdAt (got ${JSON.stringify(createdAt)})`;
  }

  // taskId may be absent, but a present one must be a real id.
  const taskIdPresent = taskId !== undefined;
  if (taskIdPresent && !isNonEmptyString(taskId)) {
    return `learning record taskId, when present, must be a non-empty string (got ${JSON.stringify(taskId)})`;
  }

  return null;
}
1131
+
1132
+ // True for the two learning states that have been reviewed and kept by the human
1133
+ // (confirmed = correct as written; edited = correct after rewording). Only these
1134
+ // graduate into the long-term profile; "proposed" is an un-reviewed guess and
1135
+ // "dropped" was reviewed and rejected, so neither counts. SINGLE source of the
1136
+ // "what counts as a kept preference" rule, shared by the status recall display
1137
+ // and any future graduation step so they cannot drift.
1138
/**
 * Whether a learning status is one of the two kept states.
 *
 * @param {unknown} status
 * @returns {boolean} true only for exactly "confirmed" or "edited".
 */
export function isGraduatedLearningStatus(status) {
  return ["confirmed", "edited"].includes(status);
}
1141
+
1142
+ // SINGLE source of "the standing preference to echo back next time" (P4 recall).
1143
+ // Returns the most recently captured profile-type learning row the human has kept
1144
+ // (status confirmed or edited), or null when there is none. "Most recent" is the
1145
+ // last such row in ledger order; since rows are append-only and status flips are
1146
+ // rewritten in place preserving order, the last kept profile row is the latest
1147
+ // preference the user stood behind. Deliberately returns ONE row, not a list —
1148
+ // the recall is a single "still working the way you confirmed" reminder, not a
1149
+ // dump of every preference (the point is the user feels understood without being
1150
+ // made to maintain a system).
1151
/**
 * The last row, in ledger order, that is a ledger record of type "profile"
 * with a graduated status (see isGraduatedLearningStatus).
 *
 * @param {Iterable<unknown> | null | undefined} records - learning ledger
 *   rows; null/undefined is treated as empty.
 * @returns {object | null} that row, or null when there is none.
 */
export function latestConfirmedProfileLearning(records) {
  const keptProfiles = [...(records ?? [])].filter(
    (row) => isLedgerRecord(row) && row.type === "profile" && isGraduatedLearningStatus(row.status)
  );
  // Ledger order is oldest-first, so the final kept row is the one to return.
  return keptProfiles.length > 0 ? keptProfiles[keptProfiles.length - 1] : null;
}
1161
+
1162
+ // SINGLE source of "every standing preference the user has kept" — the LIST twin
1163
+ // of latestConfirmedProfileLearning. Returns ALL profile-type rows the human kept
1164
+ // (status confirmed or edited), in ledger order, so a consumer that must surface
1165
+ // the whole confirmed profile (e.g. `adapters install` injecting "Your confirmed
1166
+ // preferences" into the rule files a real tool reads) sees them all, not just the
1167
+ // most recent one. Reuses isGraduatedLearningStatus so "what counts as kept" never
1168
+ // drifts from the recall display. Deliberately strict: a "proposed" row (an
1169
+ // un-reviewed guess) and a "dropped" row (reviewed and rejected) are BOTH excluded
1170
+ // — injecting a proposed preference would pass off an unconfirmed guess as a
1171
+ // standing rule, exactly the dishonesty the proposed/confirmed buffer exists to
1172
+ // prevent. Returns [] (never null) when nothing is kept, so the caller can tell
1173
+ // "no confirmed preferences" from a real list without a null check.
1174
/**
 * Every row, in ledger order, that is a ledger record of type "profile" with
 * a graduated status (see isGraduatedLearningStatus).
 *
 * @param {Iterable<unknown> | null | undefined} records - learning ledger
 *   rows; null/undefined is treated as empty.
 * @returns {object[]} a new array of the kept rows; empty (never null) when
 *   nothing qualifies.
 */
export function confirmedProfileLearnings(records) {
  const isKeptProfile = (row) =>
    isLedgerRecord(row) && row.type === "profile" && isGraduatedLearningStatus(row.status);
  return [...(records ?? [])].filter(isKeptProfile);
}
1184
+
1185
+ // SINGLE source of "the most recent confirmed/edited HARVEST lesson to echo back"
1186
+ // — the symmetric twin of latestConfirmedProfileLearning, so status can surface a
1187
+ // kept harvest lesson the same way it surfaces a kept preference. A harvest row is
1188
+ // a lesson the run loop captured (P4 harvest), NOT a standing preference; before
1189
+ // this it had nowhere to show up after being confirmed (only profile rows were
1190
+ // recalled), so a user who confirmed a harvest lesson saw it vanish. Returns the
1191
+ // last kept harvest row in ledger order (append-only + in-place status rewrites
1192
+ // preserve order, so the last kept harvest row is the latest), or null when none.
1193
+ // Deliberately ONE row, not a dump — same restraint as the profile recall.
1194
/**
 * The last row, in ledger order, that is a ledger record of type "harvest"
 * with a graduated status (see isGraduatedLearningStatus).
 *
 * @param {Iterable<unknown> | null | undefined} records - learning ledger
 *   rows; null/undefined is treated as empty.
 * @returns {object | null} that row, or null when there is none.
 */
export function latestConfirmedHarvestLearning(records) {
  const isKeptHarvest = (row) =>
    isLedgerRecord(row) && row.type === "harvest" && isGraduatedLearningStatus(row.status);

  // Fold left to right; each later kept row replaces the earlier pick.
  return [...(records ?? [])].reduce(
    (latest, row) => (isKeptHarvest(row) ? row : latest),
    null
  );
}
1204
+
1205
+ // === Synthetic-seed detection (status honesty) ==============================
1206
+ //
1207
+ // init writes one mutually-consistent SYNTHETIC seed set (task t0, evidence
1208
+ // e0/e1, run r0, receipt c0, learning l0) so a brand-new workspace passes every
1209
+ // ledger check with zero errors and the privacy scanner has a real line to scan
1210
+ // (see workspace.js). The side effect: a fresh `status` shows "Receipts: 1
1211
+ // [accepted=1]" etc. that the user did NOT earn, which reads as fake progress.
1212
+ // This is the SINGLE source of "is this row the shipped example seed, not the
1213
+ // user's own work", shared by the status display so the example rows can be
1214
+ // labelled instead of silently counted as achievements.
1215
+ //
1216
+ // SYNTHETIC_SEED_TS is the fixed timestamp every seed row carries (mirrors
1217
+ // SYNTHETIC_TS in workspace.js — the writer's value; kept in sync by the seed
1218
+ // self-consistency tests). The detector keys on BOTH a known seed id AND that
1219
+ // fixed timestamp, so a real user row can never be mislabelled an example:
1220
+ // - a user task is t1+ (nextId skips the t0 seed), so id "t0" + the synthetic
1221
+ // timestamp is unambiguous; the same holds for the other seed ids.
1222
+ // - requiring the synthetic timestamp too means that even if a user somehow
1223
+ // reused a seed id, a row stamped with a REAL createdAt is treated as real.
1224
+ // Conservative on purpose: a row is only an "example seed" when it matches the
1225
+ // shipped id AND the shipped timestamp; anything else is the user's own.
1226
/** The fixed timestamp string that seed rows are compared against. */
export const SYNTHETIC_SEED_TS = "2026-01-01T00:00:00.000Z";

/**
 * Ledger key -> Set of the seed row ids for that ledger. Written as
 * [ledgerKey, ids] pairs and expanded into an object of Sets.
 */
const SEED_IDS_BY_LEDGER = Object.fromEntries(
  [
    ["tasks", ["t0"]],
    ["evidence", ["e0", "e1"]],
    ["runs", ["r0"]],
    ["receipts", ["c0"]],
    ["learning", ["l0"]]
  ].map(([ledgerKey, ids]) => [ledgerKey, new Set(ids)])
);
1235
+
1236
+ // True when `record` is one of the shipped synthetic seed rows for `ledgerKey`
1237
+ // (a known seed id stamped with the synthetic timestamp). `ledgerKey` is one of
1238
+ // the SEED_IDS_BY_LEDGER keys; an unknown key (or a non-record) is never a seed.
1239
/**
 * Whether `record` is a seed row for `ledgerKey`: its id is in that ledger's
 * SEED_IDS_BY_LEDGER set AND it carries SYNTHETIC_SEED_TS in `createdAt`,
 * `startedAt`, or `finishedAt`.
 *
 * @param {unknown} record - candidate ledger row; a non-record is never a seed.
 * @param {string} ledgerKey - one of the SEED_IDS_BY_LEDGER keys; any other
 *   key yields false.
 * @returns {boolean}
 */
export function isSeedRow(record, ledgerKey) {
  if (!isLedgerRecord(record)) return false;
  const ids = SEED_IDS_BY_LEDGER[ledgerKey];
  // Require an actual Set, not just a truthy value: keys such as
  // "constructor", "toString" or "__proto__" resolve to inherited
  // Object.prototype members, which are truthy but have no `.has`, so a plain
  // truthiness check would throw instead of answering "not a seed".
  if (!(ids instanceof Set) || !ids.has(record.id)) return false;
  // The timestamp may sit in any of these three fields, so all are accepted.
  return (
    record.createdAt === SYNTHETIC_SEED_TS ||
    record.startedAt === SYNTHETIC_SEED_TS ||
    record.finishedAt === SYNTHETIC_SEED_TS
  );
}
1252
+
1253
+ // How many rows in `records` are the shipped synthetic seed for `ledgerKey`.
1254
+ // A thin count over isSeedRow so callers (e.g. status's top counters) can mark
1255
+ // "N of these are example seeds" the same way the Tasks counter already does,
1256
+ // keeping the definition of "what is a seed" in one place. Pure read: it counts
1257
+ // existing rows, it never changes them, so the displayed total is unaffected.
1258
/**
 * Count how many entries of `records` isSeedRow accepts for `ledgerKey`.
 * Read-only: the rows are inspected, never modified.
 *
 * @param {*} records - ledger rows; anything that is not an array counts as 0.
 * @param {string} ledgerKey - ledger name forwarded to isSeedRow.
 * @returns {number} number of seed rows found.
 */
export function countSeedRows(records, ledgerKey) {
  if (!Array.isArray(records)) return 0;

  let seedCount = 0;
  for (const row of records) {
    if (isSeedRow(row, ledgerKey)) seedCount += 1;
  }
  return seedCount;
}
1262
+
1263
/**
 * Decide whether a receipt can back a "done" claim: it must be a ledger record
 * whose stored status is "accepted" and whose re-computed guard level is not
 * flagged familyUnverified.
 *
 * @param {*} receipt - candidate receipt row.
 * @param {Array} [evidenceRecords=[]] - evidence rows passed to computeReceiptGuardLevel.
 * @param {Array} [runRecords=[]] - run rows passed to computeReceiptGuardLevel.
 * @returns {boolean} true when the receipt is accepted and not family-unverified.
 */
export function isDoneEligibleReceipt(receipt, evidenceRecords = [], runRecords = []) {
  const isAccepted = isLedgerRecord(receipt) && receipt.status === "accepted";
  if (!isAccepted) return false;

  // The family marker is derived from evidence rather than read off the row.
  const { familyUnverified } = computeReceiptGuardLevel(receipt, evidenceRecords, runRecords);
  return familyUnverified !== true;
}
1268
+
1269
/**
 * Report whether any receipt belonging to `taskId` is Done-eligible.
 *
 * @param {*} taskId - task id; anything but a non-empty string yields false.
 * @param {*} [receiptRecords=[]] - receipt rows; a non-array yields false.
 * @param {Array} [evidenceRecords=[]] - evidence rows forwarded to isDoneEligibleReceipt.
 * @param {Array} [runRecords=[]] - run rows forwarded to isDoneEligibleReceipt.
 * @returns {boolean} true when at least one matching receipt is Done-eligible.
 */
export function taskHasDoneEligibleReceipt(taskId, receiptRecords = [], evidenceRecords = [], runRecords = []) {
  const hasUsableTaskId = typeof taskId === "string" && taskId.length > 0;
  if (!hasUsableTaskId || !Array.isArray(receiptRecords)) return false;

  for (const receipt of receiptRecords) {
    if (!isLedgerRecord(receipt) || receipt.taskId !== taskId) continue;
    if (isDoneEligibleReceipt(receipt, evidenceRecords, runRecords)) return true;
  }
  return false;
}
1278
+
1279
/**
 * Report whether any receipt of `taskId` has the stored status "accepted".
 * Only the stored status is consulted; no guard level is computed here.
 *
 * @param {*} taskId - task id; anything but a non-empty string yields false.
 * @param {*} [receiptRecords=[]] - receipt rows; a non-array yields false.
 * @returns {boolean} true when at least one matching receipt is accepted.
 */
export function taskHasAcceptedReceipt(taskId, receiptRecords = []) {
  if (typeof taskId !== "string" || taskId.length === 0) return false;
  if (!Array.isArray(receiptRecords)) return false;

  const isAcceptedForTask = (receipt) =>
    isLedgerRecord(receipt) && receipt.taskId === taskId && receipt.status === "accepted";
  return receiptRecords.findIndex(isAcceptedForTask) !== -1;
}
1286
+
1287
/**
 * Detect a task whose own status says "done" while no Done-eligible receipt
 * backs that claim.
 *
 * @param {*} task - candidate task row; non-records yield false.
 * @param {Array} [receiptRecords=[]] - receipt rows.
 * @param {Array} [evidenceRecords=[]] - evidence rows.
 * @param {Array} [runRecords=[]] - run rows.
 * @returns {boolean} true when status is "done" and no Done-eligible receipt exists.
 */
export function taskIsAuthorMarkedDoneUnverified(task, receiptRecords = [], evidenceRecords = [], runRecords = []) {
  if (!isLedgerRecord(task) || task.status !== "done") return false;
  return !taskHasDoneEligibleReceipt(task.id, receiptRecords, evidenceRecords, runRecords);
}
1292
+
1293
/**
 * Produce the status text to show for a task. A "done" task without a
 * Done-eligible receipt is labelled as author-marked and unverified; otherwise
 * the task's own status string is returned ("" when it is not a string).
 *
 * @param {*} task - task row (may be null/undefined).
 * @param {Array} [receiptRecords=[]] - receipt rows.
 * @param {Array} [evidenceRecords=[]] - evidence rows.
 * @param {Array} [runRecords=[]] - run rows.
 * @returns {string} display status.
 */
export function taskStatusDisplay(task, receiptRecords = [], evidenceRecords = [], runRecords = []) {
  const rawStatus = task?.status;
  const lacksVerification = taskIsAuthorMarkedDoneUnverified(
    task,
    receiptRecords,
    evidenceRecords,
    runRecords
  );
  if (lacksVerification) return "done — author-marked, unverified";
  return typeof rawStatus === "string" ? rawStatus : "";
}
1300
+
1301
+ // === Per-task achievement summary (the "what did I earn" view) ==============
1302
+ //
1303
+ // status used to print only counters ("Tasks: 2 / Receipts: 2 [accepted=2]"),
1304
+ // which never told the user WHAT they earned ON WHICH TASK. This joins each task
1305
+ // to its own receipts / evidence / runs so status can show, per task: the title,
1306
+ // the task status, and the strongest receipt (verdict + recomputed guardLevel +
1307
+ // who accepted it) plus the evidence/run counts behind it. It is a pure read /
1308
+ // aggregation over already-recorded rows, but it deliberately RE-COMPUTES a
1309
+ // receipt's guard level + family marker from that receipt's own evidence via
1310
+ // computeReceiptGuardLevel. That matches the validator/handoff anti-tamper
1311
+ // pattern and prevents a hand-edited stored guardLevel/familyUnverified field
1312
+ // from making `status` look cleaner than `check` and `handoff`.
1313
+ //
1314
+ // "Strongest receipt" = highest recomputed guardLevel, ties broken by the later
1315
+ // createdAt (the most recent at that level), so a task shows its best-earned
1316
+ // receipt rather than an arbitrary one. A task with no receipt reports
1317
+ // receipt:null (still useful: title + evidence/run counts so an in-progress task
1318
+ // is visible). Each entry is flagged isSeed so the display can mark example rows.
1319
/**
 * Build one summary entry per task, joining the task to its own receipts,
 * evidence rows and runs.
 *
 * The "strongest" receipt of a task is the one with the highest guard level as
 * re-computed by computeReceiptGuardLevel (the stored guardLevel field is not
 * read); ties go to the later createdAt. A task without receipts reports
 * `receipt: null`.
 *
 * @param {*} tasks - task rows (non-arrays are treated as empty).
 * @param {*} receipts - receipt rows (non-arrays are treated as empty).
 * @param {*} evidence - evidence rows (non-arrays are treated as empty).
 * @param {*} runs - run rows (non-arrays are treated as empty).
 * @returns {Array<object>} entries shaped
 *   { id, title, status, statusDisplay, authorMarkedDoneUnverified, isSeed,
 *     evidenceCount, runCount, receiptCount, receipt }.
 */
export function summarizeTasks(tasks, receipts, evidence, runs) {
  const taskList = Array.isArray(tasks) ? tasks : [];
  const receiptList = Array.isArray(receipts) ? receipts : [];
  const evidenceList = Array.isArray(evidence) ? evidence : [];
  const runList = Array.isArray(runs) ? runs : [];

  // A missing createdAt must rank as OLDEST in the tie-break. Stringifying it
  // directly yields "undefined", which compares greater than every ISO
  // timestamp and would make an undated row win as the "latest" receipt.
  const createdAtKey = (receipt) => (receipt.createdAt == null ? "" : String(receipt.createdAt));

  return taskList.filter(isLedgerRecord).map((task) => {
    const belongsToTask = (row) => isLedgerRecord(row) && row.taskId === task.id;
    const taskReceipts = receiptList.filter(belongsToTask);
    const evidenceCount = evidenceList.filter(belongsToTask).length;
    const runCount = runList.filter(belongsToTask).length;

    // Pair each receipt with its re-computed guard level / family marker.
    const receiptViews = taskReceipts.map((receipt) => ({
      receipt,
      computed: computeReceiptGuardLevel(receipt, evidenceList, runList)
    }));

    // Pick the strongest receipt: highest re-computed level, tie -> latest createdAt.
    let best = null;
    for (const view of receiptViews) {
      if (best === null) {
        best = view;
        continue;
      }
      const rankDelta = guardLevelRank(view.computed.level) - guardLevelRank(best.computed.level);
      if (rankDelta > 0) {
        best = view;
      } else if (rankDelta === 0 && createdAtKey(view.receipt) > createdAtKey(best.receipt)) {
        best = view;
      }
    }

    const bestSummary = best
      ? {
          id: best.receipt.id,
          verdict: best.receipt.verdict,
          guardLevel: best.computed.level,
          status: best.receipt.status,
          acceptedBy: typeof best.receipt.acceptedBy === "string" ? best.receipt.acceptedBy : null,
          familyUnverified: best.computed.familyUnverified === true
        }
      : null;

    return {
      id: task.id,
      title: typeof task.title === "string" ? task.title : "",
      status: task.status,
      statusDisplay: taskStatusDisplay(task, receiptList, evidenceList, runList),
      authorMarkedDoneUnverified: taskIsAuthorMarkedDoneUnverified(task, receiptList, evidenceList, runList),
      isSeed: isSeedRow(task, "tasks"),
      evidenceCount,
      runCount,
      receiptCount: taskReceipts.length,
      receipt: bestSummary
    };
  });
}
1381
+
1382
+ // === Handoff draft model (the "resume across tools" view) ===================
1383
+ //
1384
+ // `handoff create` exists to turn the ledger into a draft handoff note so the
1385
+ // next session/tool does not start from zero. This function is the HONEST CORE
1386
+ // of that command: it reads the already-recorded rows and sorts each task into
1387
+ // done / pending / blocked / unverified buckets using the SAME definition of
1388
+ // "done" the rest of the tool enforces — a task is DONE only when it carries a
1389
+ // Done-eligible receipt (stored status "accepted", no unverified cross-family
1390
+ // claim). Acceptance is read from the stored status; the guard level and family
1391
+ // marker are re-computed from evidence, so a draft cannot outclaim the ledger.
1392
+ //
1393
+ // The classification rule (deliberately strict, so "done" cannot overclaim):
1394
+ // - DONE: the task has >= 1 DONE-ELIGIBLE receipt — status "accepted" AND not a
1395
+ // self-declared/unverified cross-family level (familyUnverified !== true). A
1396
+ // plain `pass` with own-task evidence auto-accepts; a `pass_with_risk` only
1397
+ // reaches "accepted" after an explicit owner sign-off (see receiptStatusFor);
1398
+ // an owner-accepted pass_with_risk is Done (it does not claim cross-family, so
1399
+ // it is not familyUnverified). This is the ONLY bucket that asserts verified
1400
+ // completion.
1401
+ // - UNVERIFIED: the task has >= 1 receipt but NONE is Done-eligible — i.e. the
1402
+ // work was reviewed but the review did not clear the bar. This is where a
1403
+ // `pass_with_risk` that is still pending lands, where a receipt that cites no
1404
+ // own-task evidence (pending) lands, AND where a task whose only acceptance is
1405
+ // a SELF-DECLARED cross-family level (familyUnverified === true) lands: the
1406
+ // tool cannot verify the family, so an accepted-but-unverified-cross-family
1407
+ // receipt is NOT trusted as done — its honest reason ("re-check the family
1408
+ // identity with a real different model family") rides along in riskNotes. A task can
1409
+ // be in DONE and have OTHER non-Done-eligible receipts; those extra receipts
1410
+ // are surfaced as risk notes but do not pull the task out of DONE (its
1411
+ // genuine accepted, non-unverified receipt stands).
1412
+ // - BLOCKED: the task's own status is "blocked" (and it is not already DONE).
1413
+ // - PENDING: everything else — an open/partial/unverified-status task with no
1414
+ // accepted receipt and no blocked flag, OR a task with no receipt at all.
1415
+ // This is "still in progress / not yet reviewed".
1416
+ // A task with a Done-eligible accepted receipt is reported as DONE regardless of
1417
+ // its task status field, because that receipt is the stronger, evidence-backed
1418
+ // signal (the status string is author-set; acceptance is gated).
1419
+ //
1420
+ // `taskId` (optional) narrows the model to a single task (its bucket + a header
1421
+ // note); omitted, the model covers every non-seed task. Seed rows are excluded
1422
+ // by default (a draft about the shipped example helps no one) unless a seed task
1423
+ // is explicitly named via taskId.
1424
+ //
1425
+ // Returns a plain data object (no rendering, no I/O) so it is unit-testable and
1426
+ // the CLI/renderer can format it however it likes:
1427
+ // { focusTaskId,
1428
+ // done: [entry], pending: [entry], blocked: [entry], unverified: [entry],
1429
+ // learnings: [keptLearningRow], counts: { tasksConsidered, done, pending, blocked, unverified } }
1430
+ // where each `entry` is { id, title, taskStatus, taskStatusDisplay, authorMarkedDoneUnverified, isSeed,
1431
+ // receipts / acceptedReceipts / unacceptedReceipts: [receiptView], evidence: [evidenceView], runs: [runView], riskNotes: [string] }.
1432
/**
 * Build the plain-data model behind a handoff draft: every selected task is
 * sorted into exactly one of the done / pending / blocked / unverified buckets
 * and the kept learnings are collected. No rendering and no I/O happen here.
 *
 * Bucketing, in priority order:
 *   1. DONE       - the task has at least one Done-eligible receipt
 *                   (isDoneEligibleReceipt), whatever its own status says.
 *   2. BLOCKED    - the task's own status is "blocked".
 *   3. UNVERIFIED - the task has receipts, but none is Done-eligible.
 *   4. PENDING    - the task has no receipt at all.
 *
 * Guard levels and the familyUnverified marker shown in the model come from
 * computeReceiptGuardLevel, never from fields stored on the receipt row.
 *
 * @param {object} [ledgers={}] - { tasks, evidence, runs, receipts, learning };
 *   each entry that is not an array is treated as empty, and rows that are not
 *   ledger records are dropped.
 * @param {object} [options={}] - `taskId` (non-empty string) narrows the model to
 *   that task; without it every task that is not a seed row is covered.
 * @returns {object} { focusTaskId, done, pending, blocked, unverified, learnings,
 *   counts: { tasksConsidered, done, pending, blocked, unverified } }.
 */
export function buildHandoffModel(ledgers = {}, options = {}) {
  // Default parameters only cover `undefined`; tolerate an explicit null too.
  const source = ledgers ?? {};
  const recordsOf = (rows) => (Array.isArray(rows) ? rows.filter(isLedgerRecord) : []);
  const tasks = recordsOf(source.tasks);
  const evidence = recordsOf(source.evidence);
  const runs = recordsOf(source.runs);
  const receipts = recordsOf(source.receipts);
  const learning = recordsOf(source.learning);

  const requestedTaskId = options?.taskId;
  const focusTaskId =
    typeof requestedTaskId === "string" && requestedTaskId.length > 0 ? requestedTaskId : null;

  // "Accepted" is read from the stored status string, not re-derived from the verdict.
  const isAcceptedReceipt = (receipt) => receipt.status === "accepted";

  // The guard level is needed for bucketing, sorting, display and risk notes.
  // Compute it once per receipt row instead of on every use (the sort comparator
  // alone would otherwise recompute it on each comparison).
  // NOTE(review): assumes computeReceiptGuardLevel is deterministic for fixed
  // (receipt, evidence, runs) inputs - confirm against its definition.
  const guardByReceipt = new Map();
  const guardOf = (receipt) => {
    if (!guardByReceipt.has(receipt)) {
      guardByReceipt.set(receipt, computeReceiptGuardLevel(receipt, evidence, runs));
    }
    return guardByReceipt.get(receipt);
  };
  const isFamilyUnverified = (receipt) => guardOf(receipt).familyUnverified === true;
  const isLocallyReconciledL4 = (receipt) => guardOf(receipt).level === "L4";
  // The level the draft displays: the re-computed one, never receipt.guardLevel.
  const displayGuardLevel = (receipt) => guardOf(receipt).level;

  // Newest-first ordering on timestamps. A missing timestamp ranks as OLDEST:
  // stringifying it directly yields "undefined", which sorts after every ISO
  // timestamp and would put undated rows first. Plain code-unit comparison keeps
  // the order independent of the runtime locale.
  const timestampKey = (value) => (value == null ? "" : String(value));
  const newestFirst = (left, right) => {
    const leftKey = timestampKey(left);
    const rightKey = timestampKey(right);
    if (leftKey === rightKey) return 0;
    return leftKey < rightKey ? 1 : -1;
  };

  // Strongest receipt first: highest re-computed level, then latest createdAt.
  const byStrength = (a, b) => {
    const rankDelta = guardLevelRank(displayGuardLevel(b)) - guardLevelRank(displayGuardLevel(a));
    if (rankDelta !== 0) return rankDelta;
    return newestFirst(a.createdAt, b.createdAt);
  };

  // Each call returns a fresh view object, so the three receipt lists of an
  // entry never share view instances.
  const toReceiptView = (receipt) => ({
    id: receipt.id,
    verdict: receipt.verdict,
    guardLevel: displayGuardLevel(receipt),
    status: receipt.status,
    acceptedBy: typeof receipt.acceptedBy === "string" ? receipt.acceptedBy : null,
    familyUnverified: isFamilyUnverified(receipt),
    reviewMode: typeof receipt.reviewMode === "string" ? receipt.reviewMode : null,
    levelExplanation: typeof receipt.levelExplanation === "string" ? receipt.levelExplanation : null,
    evidenceIds: Array.isArray(receipt.evidenceIds) ? receipt.evidenceIds : []
  });

  // Ledger-order selection: the focused task only, or every non-seed task.
  const selected = tasks.filter((task) => {
    if (focusTaskId) return task.id === focusTaskId;
    return !isSeedRow(task, "tasks");
  });

  const done = [];
  const pending = [];
  const blocked = [];
  const unverified = [];

  for (const task of selected) {
    const taskReceipts = receipts.filter((receipt) => receipt.taskId === task.id);
    const taskEvidence = evidence.filter((row) => row.taskId === task.id);
    const taskRuns = runs.filter((row) => row.taskId === task.id);

    const acceptedReceipts = taskReceipts.filter(isAcceptedReceipt);
    const unacceptedReceipts = taskReceipts.filter((receipt) => !isAcceptedReceipt(receipt));
    const doneEligibleReceipts = acceptedReceipts.filter(
      (receipt) => isDoneEligibleReceipt(receipt, evidence, runs)
    );

    const sortedAccepted = [...acceptedReceipts].sort(byStrength);
    const sortedUnaccepted = [...unacceptedReceipts].sort(byStrength);

    const evidenceViews = taskEvidence.map((row) => ({
      id: row.id,
      kind: typeof row.kind === "string" ? row.kind : "",
      summary: typeof row.summary === "string" ? row.summary : ""
    }));

    // Most recent run first, keyed on finishedAt and falling back to startedAt.
    const runViews = [...taskRuns]
      .sort((a, b) => newestFirst(a.finishedAt ?? a.startedAt, b.finishedAt ?? b.startedAt))
      .map((row) => ({
        id: row.id,
        command: typeof row.command === "string" ? row.command : null,
        exitCode: Number.isInteger(row.exitCode) ? row.exitCode : null,
        status: typeof row.status === "string" ? row.status : ""
      }));

    // Risk notes travel with the entry into whichever bucket it lands in.
    const riskNotes = [];
    for (const receipt of sortedUnaccepted) {
      if (receipt.verdict === "pass_with_risk" && receipt.status !== "accepted") {
        riskNotes.push(
          `receipt ${receipt.id}: pass_with_risk at ${displayGuardLevel(receipt)} is NOT owner-accepted (still ${receipt.status}) — the named residual risk has not been signed off.`
        );
      } else if (receipt.status === "pending") {
        riskNotes.push(
          `receipt ${receipt.id}: ${receipt.verdict} at ${displayGuardLevel(receipt)} is pending (not accepted) — treat as unverified.`
        );
      } else if (receipt.status === "rejected") {
        riskNotes.push(
          `receipt ${receipt.id}: ${receipt.verdict} at ${displayGuardLevel(receipt)} was rejected — the work did not pass review.`
        );
      }
      if (isFamilyUnverified(receipt)) {
        riskNotes.push(
          `receipt ${receipt.id}: the cross-family review at ${displayGuardLevel(receipt)} is SELF-DECLARED and unverified — re-check with a real different model family before trusting it as cross-family done.`
        );
      }
      if (isLocallyReconciledL4(receipt)) {
        riskNotes.push(
          `receipt ${receipt.id}: L4 local execution evidence is present — the cited rerun reconciles to a recorded run exec output.`
        );
      }
    }
    // An accepted receipt whose derived family marker is set still gets a note:
    // it is the stated reason such a task is not reported as Done.
    for (const receipt of sortedAccepted) {
      if (isFamilyUnverified(receipt)) {
        riskNotes.push(
          `receipt ${receipt.id}: pass · ${displayGuardLevel(receipt)} · accepted locally, but its cross-family attribution is self-declared / unverified — re-check with a real different model family before trusting it as cross-family done.`
        );
      }
      if (isLocallyReconciledL4(receipt)) {
        riskNotes.push(
          `receipt ${receipt.id}: L4 local execution evidence is present — the cited rerun reconciles to a recorded run exec output.`
        );
      }
    }
    const authorMarkedDoneUnverified = taskIsAuthorMarkedDoneUnverified(task, receipts, evidence, runs);
    if (authorMarkedDoneUnverified) {
      riskNotes.push(
        "task status is done, but there is no done-eligible accepted receipt; shown as author-marked, unverified until evidence plus an accepted receipt verify it."
      );
    }

    const entry = {
      id: task.id,
      title: typeof task.title === "string" ? task.title : "",
      taskStatus: task.status,
      taskStatusDisplay: taskStatusDisplay(task, receipts, evidence, runs),
      authorMarkedDoneUnverified,
      isSeed: isSeedRow(task, "tasks"),
      receipts: [...sortedAccepted, ...sortedUnaccepted].map(toReceiptView),
      acceptedReceipts: sortedAccepted.map(toReceiptView),
      unacceptedReceipts: sortedUnaccepted.map(toReceiptView),
      evidence: evidenceViews,
      runs: runViews,
      riskNotes
    };

    // Bucket: Done-eligible receipt > blocked status > any receipt > nothing reviewed.
    if (doneEligibleReceipts.length > 0) {
      done.push(entry);
    } else if (task.status === "blocked") {
      blocked.push(entry);
    } else if (taskReceipts.length > 0) {
      unverified.push(entry);
    } else {
      pending.push(entry);
    }
  }

  // Carry forward only the learning rows isGraduatedLearningStatus accepts, in ledger order.
  const learnings = learning
    .filter((row) => isGraduatedLearningStatus(row.status))
    .map((row) => ({
      id: row.id,
      type: typeof row.type === "string" ? row.type : "",
      content: typeof row.content === "string" ? row.content : "",
      status: row.status
    }));

  return {
    focusTaskId,
    done,
    pending,
    blocked,
    unverified,
    learnings,
    counts: {
      tasksConsidered: selected.length,
      done: done.length,
      pending: pending.length,
      blocked: blocked.length,
      unverified: unverified.length
    }
  };
}
1671
+
1672
+ // === "Why this guard level" plain-language explanation ======================
1673
+ //
1674
+ // computeGuardLevel returns a `reason` phrased in terms of method/evidence
1675
+ // ceilings ('review method "self" and evidence both support L1'), which does not
1676
+ // tell a first-time user, in their own terms, WHAT evidence they cited and WHAT
1677
+ // to add to climb a level. This turns the SAME computed inputs into one plain
1678
+ // sentence: "what you cited -> the level it earned -> the concrete next step to
1679
+ // the level above". It is DERIVED from the actual evidence flags (not written to
1680
+ // any fixed string per level), and it NEVER claims a level higher than `level` —
1681
+ // it only names the next rung and what unlocks it, so it can never mislead a user
1682
+ // into thinking they earned more than the computed level.
1683
+ //
1684
+ // `inputs` mirrors the computeGuardLevel evidence flags plus the resolved level:
1685
+ // { level, hasCrossFamilyGuardEvidence, hasRerunOutputEvidence,
1686
+ // hasReconciledRerunEvidence, hasAuthorRunEvidence, hasAnyEvidence }
1687
+ // hasReconciledRerunEvidence is the L4-grade rerun (cited via --rerun AND
1688
+ // reconciled to a recorded run exec with matching output hash);
1689
+ // hasRerunOutputEvidence is any rerun row. The caller passes what it actually
1690
+ // computed; this function only phrases it.
1691
/**
 * Phrase a computed guard level as one plain sentence:
 * "<level>: <what was cited> — <next step>".
 *
 * The sentence is assembled purely from the flags passed in; nothing is
 * computed about the receipt here. For a level without a known next step the
 * " — <next step>" part is omitted.
 *
 * @param {object} [inputs={}] - { level, hasCrossFamilyGuardEvidence,
 *   hasReconciledRerunEvidence, hasRerunOutputEvidence, hasAuthorRunEvidence,
 *   hasAnyEvidence }; every flag defaults to false.
 * @returns {string} the explanation sentence.
 */
export function guardLevelExplanation(inputs = {}) {
  const {
    level,
    hasAnyEvidence = false,
    hasAuthorRunEvidence = false,
    hasRerunOutputEvidence = false,
    hasReconciledRerunEvidence = false,
    hasCrossFamilyGuardEvidence = false
  } = inputs;

  // Either rerun flag counts as "a rerun" in the wording.
  const citedRerun = hasReconciledRerunEvidence || hasRerunOutputEvidence;

  // What was cited, strongest kind of evidence first.
  let cited;
  if (hasCrossFamilyGuardEvidence) {
    cited = citedRerun ? "cited a cross-family review and a rerun" : "cited a cross-family review";
  } else if (citedRerun) {
    cited = "cited a rerun";
  } else if (hasAuthorRunEvidence) {
    cited = "cited run/output evidence";
  } else if (hasAnyEvidence) {
    cited = "cited a note (no run/rerun evidence)";
  } else {
    cited = "cited no evidence";
  }

  // Advice keyed on the achieved level. A Map (not an object literal) so that
  // only these exact level strings match.
  const nextStepByLevel = new Map([
    ["L0", "attach any evidence (a note, a diff, captured output) to reach L1"],
    ["L1", "cite run/output or a rerun (--kind output / command / test / rerun) to reach L2"],
    ["L2", "add a cross-family review (--kind cross_family_guard, naming the reviewer/family) to reach L3"],
    [
      "L2.5",
      "have a genuinely different model family review it (a cross_family_guard from a separate tool) to reach a full L3"
    ],
    [
      "L3",
      "add a rerun reconciled to a recorded run exec (--rerun citing a run id with matching command, exit, and output) to reach L4"
    ],
    // Top of the ladder: the text states that instead of naming a further rung.
    ["L4", "this is the strongest local level (L4)"]
  ]);

  const headline = `${level}: ${cited}`;
  const nextStep = nextStepByLevel.get(level);
  return nextStep === undefined ? headline : `${headline} — ${nextStep}`;
}
1753
+
1754
+ // === Recognized evidence kinds (advisory, NOT a closed enum) ================
1755
+ //
1756
+ // Evidence kind stays FREE-FORM by design (see the long note at the top of this
1757
+ // file: any string is accepted, backward compatibility is a hard requirement).
1758
+ // This list is the set of kinds the docs/help describe and the tool gives meaning
1759
+ // to — the two load-bearing semantic kinds (cross_family_guard, rerun), the run
1760
+ // kinds (output, command, test), and the generic documented kinds (note, diff,
1761
+ // file). It exists ONLY so `evidence add` can WARN on an unrecognized kind (a
1762
+ // likely typo like "reun" for "rerun") while still recording the row. It is NOT a
1763
+ // validation gate — isRecognizedEvidenceKind is advisory; an unknown kind is
1764
+ // still a valid generic evidence row.
1765
// Evidence kinds the tool recognizes, in three groups. Used for advisory
// recognition checks only (see isRecognizedEvidenceKind).
export const KNOWN_EVIDENCE_KINDS = [
  // Generic documented kinds.
  "note",
  "diff",
  "file",
  // Run kinds, taken from RUN_EVIDENCE_KINDS (output, command, test, rerun).
  ...RUN_EVIDENCE_KINDS,
  // The cross-family review kind.
  EVIDENCE_KIND_CROSS_FAMILY_GUARD
];
1772
+
1773
+ // True when `kind` is one of the documented/meaningful kinds above. Advisory only
1774
+ // (used to warn on a probable typo); an unrecognized kind is still accepted and
1775
+ // recorded as a generic evidence row.
1776
/**
 * Report whether `kind` appears in KNOWN_EVIDENCE_KINDS.
 *
 * @param {*} kind - evidence kind to look up.
 * @returns {boolean} true when the kind is listed.
 */
export function isRecognizedEvidenceKind(kind) {
  const isListed = KNOWN_EVIDENCE_KINDS.includes(kind);
  return isListed;
}
1779
+
1780
/**
 * Resolve the on-disk path of a ledger file.
 *
 * @param {string} stateDir - directory that holds the ledger files.
 * @param {string} ledgerKey - one of the LEDGER_FILES keys.
 * @returns {string} `stateDir` joined with the ledger's file name.
 * @throws {Error} `Unknown ledger "<key>".` when the key is not registered.
 */
export function ledgerPath(stateDir, ledgerKey) {
  // Own-property lookup only: an inherited key such as "constructor" must be
  // reported as an unknown ledger rather than handing an Object.prototype member
  // to path.join, which fails with an unrelated TypeError.
  const isKnownLedger = Object.prototype.hasOwnProperty.call(LEDGER_FILES, ledgerKey);
  const fileName = isKnownLedger ? LEDGER_FILES[ledgerKey] : undefined;
  if (!fileName) throw new Error(`Unknown ledger "${ledgerKey}".`);
  return path.join(stateDir, fileName);
}
1785
+
1786
+ // A ledger record must be a plain JSON object ({...}). Arrays, null, strings,
1787
+ // and numbers are all legal JSON but are NOT valid ledger rows: every downstream
1788
+ // check reads record.id / record.status / record.taskId, which would throw a
1789
+ // non-pointable TypeError on a null or a scalar. Centralizing the shape rule
1790
+ // here keeps the CLI writer (via readLedger) and the validator reader on the
1791
+ // same definition of "a valid record line".
1792
/**
 * Shape check for a ledger row: any non-null, non-array value whose typeof is
 * "object" passes.
 *
 * @param {*} value - parsed JSON value (or anything else).
 * @returns {boolean} true when property reads such as value.id are safe.
 */
export function isLedgerRecord(value) {
  // typeof reports "object" for null and for arrays, so rule those out first.
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
1795
+
1796
+ // Parse a JSONL file into { records, errors }. Each record is a parsed plain
1797
+ // object; each error is `{ file, line, message, kind }` for a line that is
1798
+ // non-empty but either (kind "json") does not parse as JSON, or (kind "type")
1799
+ // parses but is not a plain object. Blank lines are skipped (they are not data).
1800
+ // A missing file yields empty records + no errors (an unused ledger is valid).
1801
/**
 * Parse a JSONL ledger file into `{ records, errors }`.
 *
 * Blank lines are skipped. A non-blank line that is not valid JSON produces an
 * error of kind "json"; a line that parses but is not a ledger record produces
 * an error of kind "type". A file that does not exist yields empty lists.
 *
 * @param {string} file - path of the ledger file.
 * @returns {{records: object[], errors: Array<{file: string, line: number, message: string, kind: string}>}}
 *   parsed rows plus one error per rejected line (`line` is 1-based).
 */
export function parseLedgerFile(file) {
  const records = [];
  const errors = [];
  if (!existsSync(file)) return { records, errors };

  const raw = readFileSync(file, "utf8");
  // Decoding as "utf8" keeps a leading byte-order mark in the string, and
  // JSON.parse rejects it. Drop it so a ledger saved by a BOM-writing editor does
  // not report its first (valid) record as corrupt.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const lines = text.split("\n");
  for (let index = 0; index < lines.length; index += 1) {
    const line = lines[index];
    if (line.trim().length === 0) continue; // blank line: not data
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (parseError) {
      errors.push({ file, line: index + 1, message: parseError.message, kind: "json" });
      continue;
    }
    // Legal JSON but not a record object (null / array / scalar): report it with
    // its line number instead of letting a later property read fail.
    if (!isLedgerRecord(parsed)) {
      errors.push({ file, line: index + 1, message: "record must be an object", kind: "type" });
      continue;
    }
    records.push(parsed);
  }
  return { records, errors };
}
1828
+
1829
+ // Read all valid records from a ledger by key (throws on the first parse error,
1830
+ // for command-side callers that want fail-fast on a corrupt ledger rather than a
1831
+ // silent partial read).
1832
/**
 * Read every record of the ledger identified by `ledgerKey`, failing fast on the
 * first line that parseLedgerFile rejected.
 *
 * @param {string} stateDir - directory that holds the ledger files.
 * @param {string} ledgerKey - ledger name, resolved through ledgerPath.
 * @returns {object[]} the parsed records.
 * @throws {Error} `Corrupt ledger <file>:<line>: <reason>` for the first bad line.
 */
export function readLedger(stateDir, ledgerKey) {
  const file = ledgerPath(stateDir, ledgerKey);
  const parsed = parseLedgerFile(file);
  if (parsed.errors.length === 0) return parsed.records;

  const [firstError] = parsed.errors;
  // A "type" error already carries a complete message; any other error gets the
  // "is not valid JSON (...)" wrapper around the parser's message.
  let reason;
  if (firstError.kind === "type") {
    reason = firstError.message;
  } else {
    reason = `is not valid JSON (${firstError.message})`;
  }
  throw new Error(`Corrupt ledger ${path.basename(file)}:${firstError.line}: ${reason}`);
}
1844
+
1845
// Append one record as a single compact JSON line and return the record.
//
// Creates the ledger's parent directory if needed. The written line always ends
// with a newline so the next append starts on its own line (and a one-line file
// stays a valid single record).
//
// If the existing file is non-empty and does NOT end with a newline (for example
// after a hand edit in an editor that strips the final newline), a separating
// newline is written first. Without it the new record would be glued onto the
// last existing line, producing a line that no longer parses as JSON and making
// the next fail-fast read of the ledger throw.
export function appendLedger(stateDir, ledgerKey, record) {
  const file = ledgerPath(stateDir, ledgerKey);
  mkdirSync(path.dirname(file), { recursive: true });

  const existing = existsSync(file) ? readFileSync(file, "utf8") : "";
  const separator = existing.length > 0 && !existing.endsWith("\n") ? "\n" : "";

  appendFileSync(file, `${separator}${JSON.stringify(record)}\n`, "utf8");
  return record;
}
1854
+
1855
// Replace the whole ledger file with `records`, one compact JSON document per
// line, in the order given (used by run finish: read all -> patch the matching
// line -> write back). Creates the ledger's parent directory if needed. An empty
// array produces an empty file; otherwise the content ends with a newline.
export function writeLedger(stateDir, ledgerKey, records) {
  const file = ledgerPath(stateDir, ledgerKey);
  mkdirSync(path.dirname(file), { recursive: true });

  const lines = records.map((record) => JSON.stringify(record));
  const content = lines.length > 0 ? `${lines.join("\n")}\n` : "";
  writeFileSync(file, content, "utf8");
}
1863
+
1864
// Generate the next sequential id for a ledger given a prefix, as
// `prefix + (max numeric suffix among existing ids with this prefix + 1)`.
// With no matching id the result is `${prefix}0`.
//
// Why max+1 rather than counting rows: the synthetic seed already occupies `t0`,
// so a rows.length+1 scheme made the first real task `t2` (gap at t1) and could
// also collide after a deletion. Anchoring on the highest existing suffix yields
// t1 after the t0 seed and stays gap-free / collision-resistant. Deterministic at
// runtime; never used for the committed synthetic templates (fixed ids).
//
// The arithmetic is done in BigInt. With Number, a suffix at or above 2^53 makes
// `max + 1` round back to `max` (the function would return an id that already
// exists), and from 1e21 upward the result is rendered in exponent notation. A
// digit run of any length is therefore incremented exactly.
//
// records: iterable of parsed ledger rows; entries rejected by isLedgerRecord(),
//          and ids that are not strings or do not start with `prefix`, are ignored.
// prefix:  string prepended to the number (e.g. "t").
// returns: the new id string.
export function nextId(records, prefix) {
  let max = -1n;
  for (const record of records) {
    if (!isLedgerRecord(record)) continue;
    const { id } = record;
    if (typeof id !== "string" || !id.startsWith(prefix)) continue;
    const suffix = id.slice(prefix.length);
    // Only a pure run of digits is a sequential id we own (e.g. "t12"); ignore
    // anything else so a hand-edited id cannot derail the counter.
    if (!/^\d+$/.test(suffix)) continue;
    const value = BigInt(suffix);
    if (value > max) max = value;
  }
  return `${prefix}${max + 1n}`;
}
1886
+
1887
+ // --- Concurrency lock (B6a-2) ----------------------------------------------
1888
+ //
1889
+ // The id-allocation path is read-modify-write: readLedger -> nextId(max+1) ->
1890
+ // appendLedger. Two processes (e.g. parallel `task create`s) can interleave between
1891
+ // the read and the append and BOTH mint the same id — a duplicate that later trips
1892
+ // `check`. The CLI is synchronous (spawnSync everywhere), so we serialize the whole
1893
+ // read->compute->append (and run finish's read-all->rewrite) with a short, on-disk
1894
+ // MUTEX: a lock file created with O_EXCL (openSync flag 'wx'), which fails if the file
1895
+ // already exists, so exactly one process holds it at a time. A loser retries with a
1896
+ // small backoff until it wins or times out, and a STALE lock (left by a crashed
1897
+ // process) is reclaimed once it is older than a threshold so the ledger can never
1898
+ // wedge permanently.
1899
+
1900
+ const LOCK_RETRY_MS = 25; // milliseconds slept between failed acquisition attempts
1901
+ const LOCK_TIMEOUT_MS = 5000; // milliseconds to keep trying to acquire before throwing (never hang)
1902
+ const LOCK_STALE_MS = 10000; // milliseconds of lock-file age (by mtime) beyond which it is treated as abandoned
1903
+
1904
// Block the current thread for `ms` milliseconds without busy-spinning the CPU.
// The CLI commands are synchronous, so awaiting a timer is not an option. Instead
// we wait on a freshly allocated, private shared cell that nobody ever notifies,
// so Atomics.wait can only end by reaching its timeout.
function sleepSync(ms) {
  const buffer = new SharedArrayBuffer(Int32Array.BYTES_PER_ELEMENT);
  const cell = new Int32Array(buffer);
  Atomics.wait(cell, 0, 0, ms);
}
1911
+
1912
// Location of the ledger lock file: a fixed file name directly inside `stateDir`.
function lockPathFor(stateDir) {
  const lockFileName = ".ledger.lock";
  return path.join(stateDir, lockFileName);
}
1915
+
1916
// Single, non-blocking attempt to take the lock by creating `lockPath`.
//
// Returns the open file descriptor when the file was created, or null when it
// already exists (someone else holds the lock). Every other failure is rethrown:
// a real permissions / disk problem must not be mistaken for "locked".
function tryAcquire(lockPath) {
  let fd;
  try {
    // 'wx' = O_CREAT | O_EXCL | O_WRONLY: create-only, fail if it already exists.
    fd = openSync(lockPath, "wx");
  } catch (error) {
    const heldElsewhere = Boolean(error) && error.code === "EEXIST";
    if (!heldElsewhere) throw error;
    return null;
  }
  return fd;
}
1928
+
1929
// Decide whether the caller may retry the acquire immediately.
//
// Returns true when the lock file at `lockPath` was older (by mtime) than
// LOCK_STALE_MS and has been deleted here, since such a file was almost
// certainly left by a process that died mid-critical-section. Also returns true
// when the file turned out to be gone already (the holder released it between
// the failed acquire and this check). Returns false when the lock exists and is
// still fresh. Any error other than ENOENT is rethrown so the caller does not
// loop forever on it.
function reclaimIfStale(lockPath) {
  try {
    const now = Date.now();
    const { mtimeMs } = statSync(lockPath);
    const isStale = now - mtimeMs > LOCK_STALE_MS;
    if (!isStale) return false;
    unlinkSync(lockPath);
    return true;
  } catch (error) {
    // Lock vanished underneath us: that is fine, signal "retry now".
    if (!error || error.code !== "ENOENT") throw error;
    return true;
  }
}
1948
+
1949
// Execute `fn` while holding the ledger lock for `stateDir` and return whatever
// `fn` returns. Synchronous, matching the rest of the writer path.
//
// Acquisition: ensures `stateDir` exists (the lock file lives there), then tries
// to create the lock file. Each failed attempt either reclaims a stale/vanished
// lock and retries at once, or sleeps LOCK_RETRY_MS and retries. Once
// LOCK_TIMEOUT_MS has elapsed without success an Error is thrown, so a real
// deadlock surfaces loudly rather than hanging the CLI.
//
// Release: ALWAYS performed in `finally` (close the fd, then delete the lock
// file), so a throw inside `fn` cannot leave a wedged lock.
export function withLedgerLock(stateDir, fn) {
  mkdirSync(stateDir, { recursive: true });
  const lockPath = lockPathFor(stateDir);
  const deadline = Date.now() + LOCK_TIMEOUT_MS;

  let fd = tryAcquire(lockPath);
  while (fd === null) {
    const reclaimed = reclaimIfStale(lockPath);
    if (!reclaimed) {
      // Lock is live: give up once past the deadline, otherwise back off.
      if (Date.now() >= deadline) {
        throw new Error(`Could not acquire ledger lock (${lockPath}) within ${LOCK_TIMEOUT_MS}ms — another process may be stuck. No changes made.`);
      }
      sleepSync(LOCK_RETRY_MS);
    }
    // After a reclaim we come straight here without sleeping.
    fd = tryAcquire(lockPath);
  }

  try {
    return fn();
  } finally {
    try {
      closeSync(fd);
    } catch {
      /* fd already closed */
    }
    try {
      unlinkSync(lockPath);
    } catch (error) {
      // A missing file is tolerated (a stale-reclaim race could have deleted it);
      // anything else is surfaced.
      const alreadyGone = Boolean(error) && error.code === "ENOENT";
      if (!alreadyGone) throw error;
    }
  }
}
1983
+
1984
// Allocate the next id for `ledgerKey` and append the resulting record, with the
// whole read -> compute -> append sequence executed UNDER the ledger lock so two
// concurrent callers can never mint the same id.
//
// The ledger is read INSIDE the lock (not before it), so the id accounts for
// every row another process committed while this one was waiting.
// `buildRecord(id)` is called with the freshly computed id and must return the
// full record object to append. Returns that record.
export function appendWithNextId(stateDir, ledgerKey, prefix, buildRecord) {
  const allocateAndAppend = () => {
    const existing = readLedger(stateDir, ledgerKey);
    const record = buildRecord(nextId(existing, prefix));
    appendLedger(stateDir, ledgerKey, record);
    return record;
  };
  return withLedgerLock(stateDir, allocateAndAppend);
}
1998
+
1999
// Read-modify-write of a whole ledger, performed UNDER the ledger lock.
//
// Reads every row of `ledgerKey` inside the lock, passes the array to
// `mutate(records)`, and writes whatever `mutate` returns back as the new full
// ledger. Used by run finish's read-all -> patch matching line -> rewrite, so a
// concurrent append cannot be lost to the rewrite (the read and the write are
// serialized against other writers). Returns the array that was written.
export function rewriteLedgerUnderLock(stateDir, ledgerKey, mutate) {
  const readPatchWrite = () => {
    const current = readLedger(stateDir, ledgerKey);
    const updated = mutate(current);
    writeLedger(stateDir, ledgerKey, updated);
    return updated;
  };
  return withLedgerLock(stateDir, readPatchWrite);
}