@jokerized/getresearchdone 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (711) hide show
  1. package/.claude-plugin/plugin.json +103 -0
  2. package/README.md +211 -0
  3. package/agents/grd-baseline-assessor.md +684 -0
  4. package/agents/grd-code-reviewer.md +300 -0
  5. package/agents/grd-codebase-mapper.md +355 -0
  6. package/agents/grd-critique-agent.md +119 -0
  7. package/agents/grd-debugger.md +519 -0
  8. package/agents/grd-deep-diver.md +737 -0
  9. package/agents/grd-eval-planner.md +913 -0
  10. package/agents/grd-eval-reporter.md +717 -0
  11. package/agents/grd-executor.md +683 -0
  12. package/agents/grd-feasibility-analyst.md +624 -0
  13. package/agents/grd-integration-checker.md +367 -0
  14. package/agents/grd-knowledge-miner.md +81 -0
  15. package/agents/grd-migrator.md +88 -0
  16. package/agents/grd-phase-researcher.md +697 -0
  17. package/agents/grd-plan-checker.md +443 -0
  18. package/agents/grd-planner.md +1532 -0
  19. package/agents/grd-product-owner.md +562 -0
  20. package/agents/grd-project-researcher.md +513 -0
  21. package/agents/grd-research-synthesizer.md +273 -0
  22. package/agents/grd-roadmapper.md +798 -0
  23. package/agents/grd-surveyor.md +566 -0
  24. package/agents/grd-verifier.md +893 -0
  25. package/bin/gd.js +4 -0
  26. package/bin/gd.ts +227 -0
  27. package/bin/grd-manifest.js +4 -0
  28. package/bin/grd-manifest.ts +286 -0
  29. package/bin/grd-mcp-server.js +4 -0
  30. package/bin/grd-mcp-server.ts +124 -0
  31. package/bin/grd-tools.js +4 -0
  32. package/bin/grd-tools.ts +2471 -0
  33. package/bin/postinstall.js +4 -0
  34. package/bin/postinstall.ts +80 -0
  35. package/commands/add-phase.md +123 -0
  36. package/commands/add-todo.md +87 -0
  37. package/commands/assess-baseline.md +289 -0
  38. package/commands/autopilot.md +100 -0
  39. package/commands/autoplan.md +55 -0
  40. package/commands/check-todos.md +87 -0
  41. package/commands/compare-methods.md +262 -0
  42. package/commands/complete-milestone.md +225 -0
  43. package/commands/debug.md +372 -0
  44. package/commands/deep-dive.md +288 -0
  45. package/commands/discover.md +281 -0
  46. package/commands/discuss-phase.md +188 -0
  47. package/commands/discuss.md +55 -0
  48. package/commands/eval-report.md +310 -0
  49. package/commands/evolve.md +79 -0
  50. package/commands/execute-phase.md +1017 -0
  51. package/commands/feasibility.md +292 -0
  52. package/commands/help.md +407 -0
  53. package/commands/init.md +1508 -0
  54. package/commands/insert-phase.md +113 -0
  55. package/commands/iterate.md +327 -0
  56. package/commands/list-phase-assumptions.md +217 -0
  57. package/commands/long-term-roadmap.md +202 -0
  58. package/commands/map-codebase.md +111 -0
  59. package/commands/migrate.md +159 -0
  60. package/commands/new-milestone.md +169 -0
  61. package/commands/pause-work.md +83 -0
  62. package/commands/plan-milestone-gaps.md +373 -0
  63. package/commands/plan-phase.md +655 -0
  64. package/commands/principles.md +328 -0
  65. package/commands/product-plan.md +319 -0
  66. package/commands/progress.md +481 -0
  67. package/commands/quick.md +167 -0
  68. package/commands/reapply-patches.md +154 -0
  69. package/commands/remove-phase.md +97 -0
  70. package/commands/requirement.md +96 -0
  71. package/commands/resume-project.md +113 -0
  72. package/commands/settings.md +1144 -0
  73. package/commands/survey.md +242 -0
  74. package/commands/sync.md +246 -0
  75. package/commands/tracker-setup.md +322 -0
  76. package/commands/update.md +202 -0
  77. package/commands/verify-phase.md +335 -0
  78. package/commands/verify-work.md +701 -0
  79. package/commands/wireup.md +29 -0
  80. package/dist/bin/gd.d.ts +3 -0
  81. package/dist/bin/gd.d.ts.map +1 -0
  82. package/dist/bin/gd.js +178 -0
  83. package/dist/bin/gd.js.map +1 -0
  84. package/dist/bin/grd-manifest.d.ts +3 -0
  85. package/dist/bin/grd-manifest.d.ts.map +1 -0
  86. package/dist/bin/grd-manifest.js +202 -0
  87. package/dist/bin/grd-manifest.js.map +1 -0
  88. package/dist/bin/grd-mcp-server.d.ts +3 -0
  89. package/dist/bin/grd-mcp-server.d.ts.map +1 -0
  90. package/dist/bin/grd-mcp-server.js +71 -0
  91. package/dist/bin/grd-mcp-server.js.map +1 -0
  92. package/dist/bin/grd-tools.d.ts +3 -0
  93. package/dist/bin/grd-tools.d.ts.map +1 -0
  94. package/dist/bin/grd-tools.js +1680 -0
  95. package/dist/bin/grd-tools.js.map +1 -0
  96. package/dist/bin/postinstall.d.ts +3 -0
  97. package/dist/bin/postinstall.d.ts.map +1 -0
  98. package/dist/bin/postinstall.js +61 -0
  99. package/dist/bin/postinstall.js.map +1 -0
  100. package/dist/lib/autopilot-milestone.d.ts +2 -0
  101. package/dist/lib/autopilot-milestone.d.ts.map +1 -0
  102. package/dist/lib/autopilot-milestone.js +94 -0
  103. package/dist/lib/autopilot-milestone.js.map +1 -0
  104. package/dist/lib/autopilot-pipeline.d.ts +2 -0
  105. package/dist/lib/autopilot-pipeline.d.ts.map +1 -0
  106. package/dist/lib/autopilot-pipeline.js +830 -0
  107. package/dist/lib/autopilot-pipeline.js.map +1 -0
  108. package/dist/lib/autopilot-waves.d.ts +2 -0
  109. package/dist/lib/autopilot-waves.d.ts.map +1 -0
  110. package/dist/lib/autopilot-waves.js +266 -0
  111. package/dist/lib/autopilot-waves.js.map +1 -0
  112. package/dist/lib/autopilot.d.ts +2 -0
  113. package/dist/lib/autopilot.d.ts.map +1 -0
  114. package/dist/lib/autopilot.js +1314 -0
  115. package/dist/lib/autopilot.js.map +1 -0
  116. package/dist/lib/autoplan.d.ts +2 -0
  117. package/dist/lib/autoplan.d.ts.map +1 -0
  118. package/dist/lib/autoplan.js +198 -0
  119. package/dist/lib/autoplan.js.map +1 -0
  120. package/dist/lib/autoresearch.d.ts +2 -0
  121. package/dist/lib/autoresearch.d.ts.map +1 -0
  122. package/dist/lib/autoresearch.js +626 -0
  123. package/dist/lib/autoresearch.js.map +1 -0
  124. package/dist/lib/backend.d.ts +2 -0
  125. package/dist/lib/backend.d.ts.map +1 -0
  126. package/dist/lib/backend.js +1036 -0
  127. package/dist/lib/backend.js.map +1 -0
  128. package/dist/lib/benchmark.d.ts +99 -0
  129. package/dist/lib/benchmark.d.ts.map +1 -0
  130. package/dist/lib/benchmark.js +278 -0
  131. package/dist/lib/benchmark.js.map +1 -0
  132. package/dist/lib/citations.d.ts +2 -0
  133. package/dist/lib/citations.d.ts.map +1 -0
  134. package/dist/lib/citations.js +642 -0
  135. package/dist/lib/citations.js.map +1 -0
  136. package/dist/lib/cleanup.d.ts +2 -0
  137. package/dist/lib/cleanup.d.ts.map +1 -0
  138. package/dist/lib/cleanup.js +1222 -0
  139. package/dist/lib/cleanup.js.map +1 -0
  140. package/dist/lib/cli/adapters.d.ts +10 -0
  141. package/dist/lib/cli/adapters.d.ts.map +1 -0
  142. package/dist/lib/cli/adapters.js +27 -0
  143. package/dist/lib/cli/adapters.js.map +1 -0
  144. package/dist/lib/cli/agent.d.ts +17 -0
  145. package/dist/lib/cli/agent.d.ts.map +1 -0
  146. package/dist/lib/cli/agent.js +53 -0
  147. package/dist/lib/cli/agent.js.map +1 -0
  148. package/dist/lib/cli/index.d.ts +21 -0
  149. package/dist/lib/cli/index.d.ts.map +1 -0
  150. package/dist/lib/cli/index.js +264 -0
  151. package/dist/lib/cli/index.js.map +1 -0
  152. package/dist/lib/cli/output.d.ts +20 -0
  153. package/dist/lib/cli/output.d.ts.map +1 -0
  154. package/dist/lib/cli/output.js +22 -0
  155. package/dist/lib/cli/output.js.map +1 -0
  156. package/dist/lib/cli/scan-dispatch.d.ts +9 -0
  157. package/dist/lib/cli/scan-dispatch.d.ts.map +1 -0
  158. package/dist/lib/cli/scan-dispatch.js +107 -0
  159. package/dist/lib/cli/scan-dispatch.js.map +1 -0
  160. package/dist/lib/cli/tools.d.ts +16 -0
  161. package/dist/lib/cli/tools.d.ts.map +1 -0
  162. package/dist/lib/cli/tools.js +168 -0
  163. package/dist/lib/cli/tools.js.map +1 -0
  164. package/dist/lib/commands/_dashboard-parsers.d.ts +2 -0
  165. package/dist/lib/commands/_dashboard-parsers.d.ts.map +1 -0
  166. package/dist/lib/commands/_dashboard-parsers.js +192 -0
  167. package/dist/lib/commands/_dashboard-parsers.js.map +1 -0
  168. package/dist/lib/commands/analysis.d.ts +2 -0
  169. package/dist/lib/commands/analysis.d.ts.map +1 -0
  170. package/dist/lib/commands/analysis.js +1418 -0
  171. package/dist/lib/commands/analysis.js.map +1 -0
  172. package/dist/lib/commands/assumptions.d.ts +2 -0
  173. package/dist/lib/commands/assumptions.d.ts.map +1 -0
  174. package/dist/lib/commands/assumptions.js +166 -0
  175. package/dist/lib/commands/assumptions.js.map +1 -0
  176. package/dist/lib/commands/blame.d.ts +2 -0
  177. package/dist/lib/commands/blame.d.ts.map +1 -0
  178. package/dist/lib/commands/blame.js +133 -0
  179. package/dist/lib/commands/blame.js.map +1 -0
  180. package/dist/lib/commands/budget.d.ts +2 -0
  181. package/dist/lib/commands/budget.d.ts.map +1 -0
  182. package/dist/lib/commands/budget.js +100 -0
  183. package/dist/lib/commands/budget.js.map +1 -0
  184. package/dist/lib/commands/check-plans.d.ts +2 -0
  185. package/dist/lib/commands/check-plans.d.ts.map +1 -0
  186. package/dist/lib/commands/check-plans.js +190 -0
  187. package/dist/lib/commands/check-plans.js.map +1 -0
  188. package/dist/lib/commands/config.d.ts +2 -0
  189. package/dist/lib/commands/config.d.ts.map +1 -0
  190. package/dist/lib/commands/config.js +188 -0
  191. package/dist/lib/commands/config.js.map +1 -0
  192. package/dist/lib/commands/dashboard.d.ts +2 -0
  193. package/dist/lib/commands/dashboard.d.ts.map +1 -0
  194. package/dist/lib/commands/dashboard.js +466 -0
  195. package/dist/lib/commands/dashboard.js.map +1 -0
  196. package/dist/lib/commands/estimate.d.ts +2 -0
  197. package/dist/lib/commands/estimate.d.ts.map +1 -0
  198. package/dist/lib/commands/estimate.js +148 -0
  199. package/dist/lib/commands/estimate.js.map +1 -0
  200. package/dist/lib/commands/eval-diff.d.ts +2 -0
  201. package/dist/lib/commands/eval-diff.d.ts.map +1 -0
  202. package/dist/lib/commands/eval-diff.js +213 -0
  203. package/dist/lib/commands/eval-diff.js.map +1 -0
  204. package/dist/lib/commands/freshness.d.ts +2 -0
  205. package/dist/lib/commands/freshness.d.ts.map +1 -0
  206. package/dist/lib/commands/freshness.js +163 -0
  207. package/dist/lib/commands/freshness.js.map +1 -0
  208. package/dist/lib/commands/health.d.ts +2 -0
  209. package/dist/lib/commands/health.d.ts.map +1 -0
  210. package/dist/lib/commands/health.js +435 -0
  211. package/dist/lib/commands/health.js.map +1 -0
  212. package/dist/lib/commands/index.d.ts +2 -0
  213. package/dist/lib/commands/index.d.ts.map +1 -0
  214. package/dist/lib/commands/index.js +128 -0
  215. package/dist/lib/commands/index.js.map +1 -0
  216. package/dist/lib/commands/install.d.ts +56 -0
  217. package/dist/lib/commands/install.d.ts.map +1 -0
  218. package/dist/lib/commands/install.js +214 -0
  219. package/dist/lib/commands/install.js.map +1 -0
  220. package/dist/lib/commands/knowhow-aggregator.d.ts +2 -0
  221. package/dist/lib/commands/knowhow-aggregator.d.ts.map +1 -0
  222. package/dist/lib/commands/knowhow-aggregator.js +279 -0
  223. package/dist/lib/commands/knowhow-aggregator.js.map +1 -0
  224. package/dist/lib/commands/knowledge-search.d.ts +2 -0
  225. package/dist/lib/commands/knowledge-search.d.ts.map +1 -0
  226. package/dist/lib/commands/knowledge-search.js +113 -0
  227. package/dist/lib/commands/knowledge-search.js.map +1 -0
  228. package/dist/lib/commands/long-term-roadmap.d.ts +2 -0
  229. package/dist/lib/commands/long-term-roadmap.d.ts.map +1 -0
  230. package/dist/lib/commands/long-term-roadmap.js +272 -0
  231. package/dist/lib/commands/long-term-roadmap.js.map +1 -0
  232. package/dist/lib/commands/patterns.d.ts +91 -0
  233. package/dist/lib/commands/patterns.d.ts.map +1 -0
  234. package/dist/lib/commands/patterns.js +391 -0
  235. package/dist/lib/commands/patterns.js.map +1 -0
  236. package/dist/lib/commands/phase-info.d.ts +2 -0
  237. package/dist/lib/commands/phase-info.d.ts.map +1 -0
  238. package/dist/lib/commands/phase-info.js +509 -0
  239. package/dist/lib/commands/phase-info.js.map +1 -0
  240. package/dist/lib/commands/plan-lint.d.ts +56 -0
  241. package/dist/lib/commands/plan-lint.d.ts.map +1 -0
  242. package/dist/lib/commands/plan-lint.js +481 -0
  243. package/dist/lib/commands/plan-lint.js.map +1 -0
  244. package/dist/lib/commands/plan-phase.d.ts +53 -0
  245. package/dist/lib/commands/plan-phase.d.ts.map +1 -0
  246. package/dist/lib/commands/plan-phase.js +288 -0
  247. package/dist/lib/commands/plan-phase.js.map +1 -0
  248. package/dist/lib/commands/progress.d.ts +2 -0
  249. package/dist/lib/commands/progress.d.ts.map +1 -0
  250. package/dist/lib/commands/progress.js +266 -0
  251. package/dist/lib/commands/progress.js.map +1 -0
  252. package/dist/lib/commands/quality.d.ts +2 -0
  253. package/dist/lib/commands/quality.d.ts.map +1 -0
  254. package/dist/lib/commands/quality.js +80 -0
  255. package/dist/lib/commands/quality.js.map +1 -0
  256. package/dist/lib/commands/rollback.d.ts +2 -0
  257. package/dist/lib/commands/rollback.d.ts.map +1 -0
  258. package/dist/lib/commands/rollback.js +145 -0
  259. package/dist/lib/commands/rollback.js.map +1 -0
  260. package/dist/lib/commands/scan.d.ts +25 -0
  261. package/dist/lib/commands/scan.d.ts.map +1 -0
  262. package/dist/lib/commands/scan.js +28 -0
  263. package/dist/lib/commands/scan.js.map +1 -0
  264. package/dist/lib/commands/search.d.ts +2 -0
  265. package/dist/lib/commands/search.d.ts.map +1 -0
  266. package/dist/lib/commands/search.js +212 -0
  267. package/dist/lib/commands/search.js.map +1 -0
  268. package/dist/lib/commands/select-candidate.d.ts +128 -0
  269. package/dist/lib/commands/select-candidate.d.ts.map +1 -0
  270. package/dist/lib/commands/select-candidate.js +518 -0
  271. package/dist/lib/commands/select-candidate.js.map +1 -0
  272. package/dist/lib/commands/singularity.d.ts +2 -0
  273. package/dist/lib/commands/singularity.d.ts.map +1 -0
  274. package/dist/lib/commands/singularity.js +185 -0
  275. package/dist/lib/commands/singularity.js.map +1 -0
  276. package/dist/lib/commands/slug-timestamp.d.ts +2 -0
  277. package/dist/lib/commands/slug-timestamp.d.ts.map +1 -0
  278. package/dist/lib/commands/slug-timestamp.js +54 -0
  279. package/dist/lib/commands/slug-timestamp.js.map +1 -0
  280. package/dist/lib/commands/tail.d.ts +2 -0
  281. package/dist/lib/commands/tail.d.ts.map +1 -0
  282. package/dist/lib/commands/tail.js +100 -0
  283. package/dist/lib/commands/tail.js.map +1 -0
  284. package/dist/lib/commands/todo.d.ts +2 -0
  285. package/dist/lib/commands/todo.d.ts.map +1 -0
  286. package/dist/lib/commands/todo.js +200 -0
  287. package/dist/lib/commands/todo.js.map +1 -0
  288. package/dist/lib/commands/watch.d.ts +2 -0
  289. package/dist/lib/commands/watch.d.ts.map +1 -0
  290. package/dist/lib/commands/watch.js +72 -0
  291. package/dist/lib/commands/watch.js.map +1 -0
  292. package/dist/lib/complexity.d.ts +55 -0
  293. package/dist/lib/complexity.d.ts.map +1 -0
  294. package/dist/lib/complexity.js +80 -0
  295. package/dist/lib/complexity.js.map +1 -0
  296. package/dist/lib/context/agents.d.ts +2 -0
  297. package/dist/lib/context/agents.d.ts.map +1 -0
  298. package/dist/lib/context/agents.js +344 -0
  299. package/dist/lib/context/agents.js.map +1 -0
  300. package/dist/lib/context/base.d.ts +2 -0
  301. package/dist/lib/context/base.d.ts.map +1 -0
  302. package/dist/lib/context/base.js +81 -0
  303. package/dist/lib/context/base.js.map +1 -0
  304. package/dist/lib/context/execute.d.ts +2 -0
  305. package/dist/lib/context/execute.d.ts.map +1 -0
  306. package/dist/lib/context/execute.js +753 -0
  307. package/dist/lib/context/execute.js.map +1 -0
  308. package/dist/lib/context/index.d.ts +2 -0
  309. package/dist/lib/context/index.d.ts.map +1 -0
  310. package/dist/lib/context/index.js +88 -0
  311. package/dist/lib/context/index.js.map +1 -0
  312. package/dist/lib/context/progress.d.ts +2 -0
  313. package/dist/lib/context/progress.d.ts.map +1 -0
  314. package/dist/lib/context/progress.js +178 -0
  315. package/dist/lib/context/progress.js.map +1 -0
  316. package/dist/lib/context/project.d.ts +2 -0
  317. package/dist/lib/context/project.d.ts.map +1 -0
  318. package/dist/lib/context/project.js +413 -0
  319. package/dist/lib/context/project.js.map +1 -0
  320. package/dist/lib/context/research.d.ts +2 -0
  321. package/dist/lib/context/research.d.ts.map +1 -0
  322. package/dist/lib/context/research.js +466 -0
  323. package/dist/lib/context/research.js.map +1 -0
  324. package/dist/lib/dead-ends.d.ts +28 -0
  325. package/dist/lib/dead-ends.d.ts.map +1 -0
  326. package/dist/lib/dead-ends.js +451 -0
  327. package/dist/lib/dead-ends.js.map +1 -0
  328. package/dist/lib/deps.d.ts +2 -0
  329. package/dist/lib/deps.d.ts.map +1 -0
  330. package/dist/lib/deps.js +630 -0
  331. package/dist/lib/deps.js.map +1 -0
  332. package/dist/lib/discussion.d.ts +2 -0
  333. package/dist/lib/discussion.d.ts.map +1 -0
  334. package/dist/lib/discussion.js +1041 -0
  335. package/dist/lib/discussion.js.map +1 -0
  336. package/dist/lib/drift.d.ts +36 -0
  337. package/dist/lib/drift.d.ts.map +1 -0
  338. package/dist/lib/drift.js +481 -0
  339. package/dist/lib/drift.js.map +1 -0
  340. package/dist/lib/evolve/_dimensions-features.d.ts +2 -0
  341. package/dist/lib/evolve/_dimensions-features.d.ts.map +1 -0
  342. package/dist/lib/evolve/_dimensions-features.js +369 -0
  343. package/dist/lib/evolve/_dimensions-features.js.map +1 -0
  344. package/dist/lib/evolve/_dimensions.d.ts +2 -0
  345. package/dist/lib/evolve/_dimensions.d.ts.map +1 -0
  346. package/dist/lib/evolve/_dimensions.js +358 -0
  347. package/dist/lib/evolve/_dimensions.js.map +1 -0
  348. package/dist/lib/evolve/_product-ideation.d.ts +2 -0
  349. package/dist/lib/evolve/_product-ideation.d.ts.map +1 -0
  350. package/dist/lib/evolve/_product-ideation.js +281 -0
  351. package/dist/lib/evolve/_product-ideation.js.map +1 -0
  352. package/dist/lib/evolve/_prompts.d.ts +2 -0
  353. package/dist/lib/evolve/_prompts.d.ts.map +1 -0
  354. package/dist/lib/evolve/_prompts.js +153 -0
  355. package/dist/lib/evolve/_prompts.js.map +1 -0
  356. package/dist/lib/evolve/cli.d.ts +2 -0
  357. package/dist/lib/evolve/cli.d.ts.map +1 -0
  358. package/dist/lib/evolve/cli.js +224 -0
  359. package/dist/lib/evolve/cli.js.map +1 -0
  360. package/dist/lib/evolve/discovery.d.ts +2 -0
  361. package/dist/lib/evolve/discovery.d.ts.map +1 -0
  362. package/dist/lib/evolve/discovery.js +391 -0
  363. package/dist/lib/evolve/discovery.js.map +1 -0
  364. package/dist/lib/evolve/index.d.ts +2 -0
  365. package/dist/lib/evolve/index.d.ts.map +1 -0
  366. package/dist/lib/evolve/index.js +88 -0
  367. package/dist/lib/evolve/index.js.map +1 -0
  368. package/dist/lib/evolve/orchestrator.d.ts +2 -0
  369. package/dist/lib/evolve/orchestrator.d.ts.map +1 -0
  370. package/dist/lib/evolve/orchestrator.js +851 -0
  371. package/dist/lib/evolve/orchestrator.js.map +1 -0
  372. package/dist/lib/evolve/scoring.d.ts +2 -0
  373. package/dist/lib/evolve/scoring.d.ts.map +1 -0
  374. package/dist/lib/evolve/scoring.js +118 -0
  375. package/dist/lib/evolve/scoring.js.map +1 -0
  376. package/dist/lib/evolve/state.d.ts +2 -0
  377. package/dist/lib/evolve/state.d.ts.map +1 -0
  378. package/dist/lib/evolve/state.js +264 -0
  379. package/dist/lib/evolve/state.js.map +1 -0
  380. package/dist/lib/evolve/types.d.ts +249 -0
  381. package/dist/lib/evolve/types.d.ts.map +1 -0
  382. package/dist/lib/evolve/types.js +3 -0
  383. package/dist/lib/evolve/types.js.map +1 -0
  384. package/dist/lib/frontmatter.d.ts +2 -0
  385. package/dist/lib/frontmatter.d.ts.map +1 -0
  386. package/dist/lib/frontmatter.js +513 -0
  387. package/dist/lib/frontmatter.js.map +1 -0
  388. package/dist/lib/gates.d.ts +2 -0
  389. package/dist/lib/gates.d.ts.map +1 -0
  390. package/dist/lib/gates.js +578 -0
  391. package/dist/lib/gates.js.map +1 -0
  392. package/dist/lib/genome.d.ts +10 -0
  393. package/dist/lib/genome.d.ts.map +1 -0
  394. package/dist/lib/genome.js +368 -0
  395. package/dist/lib/genome.js.map +1 -0
  396. package/dist/lib/got.d.ts +2 -0
  397. package/dist/lib/got.d.ts.map +1 -0
  398. package/dist/lib/got.js +280 -0
  399. package/dist/lib/got.js.map +1 -0
  400. package/dist/lib/invariants.d.ts +2 -0
  401. package/dist/lib/invariants.d.ts.map +1 -0
  402. package/dist/lib/invariants.js +298 -0
  403. package/dist/lib/invariants.js.map +1 -0
  404. package/dist/lib/knowledge.d.ts +2 -0
  405. package/dist/lib/knowledge.d.ts.map +1 -0
  406. package/dist/lib/knowledge.js +658 -0
  407. package/dist/lib/knowledge.js.map +1 -0
  408. package/dist/lib/long-term-roadmap.d.ts +2 -0
  409. package/dist/lib/long-term-roadmap.d.ts.map +1 -0
  410. package/dist/lib/long-term-roadmap.js +602 -0
  411. package/dist/lib/long-term-roadmap.js.map +1 -0
  412. package/dist/lib/markdown-split.d.ts +2 -0
  413. package/dist/lib/markdown-split.d.ts.map +1 -0
  414. package/dist/lib/markdown-split.js +199 -0
  415. package/dist/lib/markdown-split.js.map +1 -0
  416. package/dist/lib/mcp-server.d.ts +2 -0
  417. package/dist/lib/mcp-server.d.ts.map +1 -0
  418. package/dist/lib/mcp-server.js +2424 -0
  419. package/dist/lib/mcp-server.js.map +1 -0
  420. package/dist/lib/metrics.d.ts +16 -0
  421. package/dist/lib/metrics.d.ts.map +1 -0
  422. package/dist/lib/metrics.js +48 -0
  423. package/dist/lib/metrics.js.map +1 -0
  424. package/dist/lib/overstory.d.ts +2 -0
  425. package/dist/lib/overstory.d.ts.map +1 -0
  426. package/dist/lib/overstory.js +211 -0
  427. package/dist/lib/overstory.js.map +1 -0
  428. package/dist/lib/parallel.d.ts +2 -0
  429. package/dist/lib/parallel.d.ts.map +1 -0
  430. package/dist/lib/parallel.js +349 -0
  431. package/dist/lib/parallel.js.map +1 -0
  432. package/dist/lib/paths.d.ts +2 -0
  433. package/dist/lib/paths.d.ts.map +1 -0
  434. package/dist/lib/paths.js +254 -0
  435. package/dist/lib/paths.js.map +1 -0
  436. package/dist/lib/phase-complete-llm.d.ts +22 -0
  437. package/dist/lib/phase-complete-llm.d.ts.map +1 -0
  438. package/dist/lib/phase-complete-llm.js +331 -0
  439. package/dist/lib/phase-complete-llm.js.map +1 -0
  440. package/dist/lib/phase-complete.d.ts +46 -0
  441. package/dist/lib/phase-complete.d.ts.map +1 -0
  442. package/dist/lib/phase-complete.js +278 -0
  443. package/dist/lib/phase-complete.js.map +1 -0
  444. package/dist/lib/phase-io.d.ts +2 -0
  445. package/dist/lib/phase-io.d.ts.map +1 -0
  446. package/dist/lib/phase-io.js +126 -0
  447. package/dist/lib/phase-io.js.map +1 -0
  448. package/dist/lib/phase.d.ts +2 -0
  449. package/dist/lib/phase.d.ts.map +1 -0
  450. package/dist/lib/phase.js +1344 -0
  451. package/dist/lib/phase.js.map +1 -0
  452. package/dist/lib/plan-tournament.d.ts +63 -0
  453. package/dist/lib/plan-tournament.d.ts.map +1 -0
  454. package/dist/lib/plan-tournament.js +353 -0
  455. package/dist/lib/plan-tournament.js.map +1 -0
  456. package/dist/lib/refinement.d.ts +74 -0
  457. package/dist/lib/refinement.d.ts.map +1 -0
  458. package/dist/lib/refinement.js +283 -0
  459. package/dist/lib/refinement.js.map +1 -0
  460. package/dist/lib/requirements.d.ts +2 -0
  461. package/dist/lib/requirements.d.ts.map +1 -0
  462. package/dist/lib/requirements.js +355 -0
  463. package/dist/lib/requirements.js.map +1 -0
  464. package/dist/lib/research-bundle.d.ts +2 -0
  465. package/dist/lib/research-bundle.d.ts.map +1 -0
  466. package/dist/lib/research-bundle.js +246 -0
  467. package/dist/lib/research-bundle.js.map +1 -0
  468. package/dist/lib/roadmap.d.ts +2 -0
  469. package/dist/lib/roadmap.d.ts.map +1 -0
  470. package/dist/lib/roadmap.js +541 -0
  471. package/dist/lib/roadmap.js.map +1 -0
  472. package/dist/lib/sample.d.ts +16 -0
  473. package/dist/lib/sample.d.ts.map +1 -0
  474. package/dist/lib/sample.js +20 -0
  475. package/dist/lib/sample.js.map +1 -0
  476. package/dist/lib/scaffold.d.ts +2 -0
  477. package/dist/lib/scaffold.d.ts.map +1 -0
  478. package/dist/lib/scaffold.js +355 -0
  479. package/dist/lib/scaffold.js.map +1 -0
  480. package/dist/lib/scan/_utils.d.ts +11 -0
  481. package/dist/lib/scan/_utils.d.ts.map +1 -0
  482. package/dist/lib/scan/_utils.js +36 -0
  483. package/dist/lib/scan/_utils.js.map +1 -0
  484. package/dist/lib/scan/base64.d.ts +15 -0
  485. package/dist/lib/scan/base64.d.ts.map +1 -0
  486. package/dist/lib/scan/base64.js +66 -0
  487. package/dist/lib/scan/base64.js.map +1 -0
  488. package/dist/lib/scan/ignorefile.d.ts +30 -0
  489. package/dist/lib/scan/ignorefile.d.ts.map +1 -0
  490. package/dist/lib/scan/ignorefile.js +101 -0
  491. package/dist/lib/scan/ignorefile.js.map +1 -0
  492. package/dist/lib/scan/injection.d.ts +14 -0
  493. package/dist/lib/scan/injection.d.ts.map +1 -0
  494. package/dist/lib/scan/injection.js +39 -0
  495. package/dist/lib/scan/injection.js.map +1 -0
  496. package/dist/lib/scan/patterns.d.ts +17 -0
  497. package/dist/lib/scan/patterns.d.ts.map +1 -0
  498. package/dist/lib/scan/patterns.js +123 -0
  499. package/dist/lib/scan/patterns.js.map +1 -0
  500. package/dist/lib/scan/strip-markdown.d.ts +7 -0
  501. package/dist/lib/scan/strip-markdown.d.ts.map +1 -0
  502. package/dist/lib/scan/strip-markdown.js +38 -0
  503. package/dist/lib/scan/strip-markdown.js.map +1 -0
  504. package/dist/lib/scan/types.d.ts +23 -0
  505. package/dist/lib/scan/types.d.ts.map +1 -0
  506. package/dist/lib/scan/types.js +3 -0
  507. package/dist/lib/scan/types.js.map +1 -0
  508. package/dist/lib/scheduler-wait.d.ts +2 -0
  509. package/dist/lib/scheduler-wait.d.ts.map +1 -0
  510. package/dist/lib/scheduler-wait.js +59 -0
  511. package/dist/lib/scheduler-wait.js.map +1 -0
  512. package/dist/lib/scheduler.d.ts +254 -0
  513. package/dist/lib/scheduler.d.ts.map +1 -0
  514. package/dist/lib/scheduler.js +1147 -0
  515. package/dist/lib/scheduler.js.map +1 -0
  516. package/dist/lib/state.d.ts +2 -0
  517. package/dist/lib/state.d.ts.map +1 -0
  518. package/dist/lib/state.js +744 -0
  519. package/dist/lib/state.js.map +1 -0
  520. package/dist/lib/think.d.ts +18 -0
  521. package/dist/lib/think.d.ts.map +1 -0
  522. package/dist/lib/think.js +317 -0
  523. package/dist/lib/think.js.map +1 -0
  524. package/dist/lib/tracker.d.ts +2 -0
  525. package/dist/lib/tracker.d.ts.map +1 -0
  526. package/dist/lib/tracker.js +1121 -0
  527. package/dist/lib/tracker.js.map +1 -0
  528. package/dist/lib/types.d.ts +1514 -0
  529. package/dist/lib/types.d.ts.map +1 -0
  530. package/dist/lib/types.js +4 -0
  531. package/dist/lib/types.js.map +1 -0
  532. package/dist/lib/utils.d.ts +2 -0
  533. package/dist/lib/utils.d.ts.map +1 -0
  534. package/dist/lib/utils.js +1363 -0
  535. package/dist/lib/utils.js.map +1 -0
  536. package/dist/lib/verify.d.ts +2 -0
  537. package/dist/lib/verify.d.ts.map +1 -0
  538. package/dist/lib/verify.js +1153 -0
  539. package/dist/lib/verify.js.map +1 -0
  540. package/dist/lib/wireup/autofix.d.ts +2 -0
  541. package/dist/lib/wireup/autofix.d.ts.map +1 -0
  542. package/dist/lib/wireup/autofix.js +188 -0
  543. package/dist/lib/wireup/autofix.js.map +1 -0
  544. package/dist/lib/wireup/cli.d.ts +2 -0
  545. package/dist/lib/wireup/cli.d.ts.map +1 -0
  546. package/dist/lib/wireup/cli.js +194 -0
  547. package/dist/lib/wireup/cli.js.map +1 -0
  548. package/dist/lib/wireup/detection.d.ts +47 -0
  549. package/dist/lib/wireup/detection.d.ts.map +1 -0
  550. package/dist/lib/wireup/detection.js +410 -0
  551. package/dist/lib/wireup/detection.js.map +1 -0
  552. package/dist/lib/wireup/discovery.d.ts +2 -0
  553. package/dist/lib/wireup/discovery.d.ts.map +1 -0
  554. package/dist/lib/wireup/discovery.js +934 -0
  555. package/dist/lib/wireup/discovery.js.map +1 -0
  556. package/dist/lib/wireup/execution.d.ts +2 -0
  557. package/dist/lib/wireup/execution.d.ts.map +1 -0
  558. package/dist/lib/wireup/execution.js +573 -0
  559. package/dist/lib/wireup/execution.js.map +1 -0
  560. package/dist/lib/wireup/index.d.ts +2 -0
  561. package/dist/lib/wireup/index.d.ts.map +1 -0
  562. package/dist/lib/wireup/index.js +85 -0
  563. package/dist/lib/wireup/index.js.map +1 -0
  564. package/dist/lib/wireup/orchestrator.d.ts +2 -0
  565. package/dist/lib/wireup/orchestrator.d.ts.map +1 -0
  566. package/dist/lib/wireup/orchestrator.js +366 -0
  567. package/dist/lib/wireup/orchestrator.js.map +1 -0
  568. package/dist/lib/wireup/report.d.ts +47 -0
  569. package/dist/lib/wireup/report.d.ts.map +1 -0
  570. package/dist/lib/wireup/report.js +201 -0
  571. package/dist/lib/wireup/report.js.map +1 -0
  572. package/dist/lib/wireup/scenarios.d.ts +2 -0
  573. package/dist/lib/wireup/scenarios.d.ts.map +1 -0
  574. package/dist/lib/wireup/scenarios.js +516 -0
  575. package/dist/lib/wireup/scenarios.js.map +1 -0
  576. package/dist/lib/wireup/state.d.ts +2 -0
  577. package/dist/lib/wireup/state.d.ts.map +1 -0
  578. package/dist/lib/wireup/state.js +102 -0
  579. package/dist/lib/wireup/state.js.map +1 -0
  580. package/dist/lib/wireup/types.d.ts +376 -0
  581. package/dist/lib/wireup/types.d.ts.map +1 -0
  582. package/dist/lib/wireup/types.js +3 -0
  583. package/dist/lib/wireup/types.js.map +1 -0
  584. package/dist/lib/worktree.d.ts +2 -0
  585. package/dist/lib/worktree.d.ts.map +1 -0
  586. package/dist/lib/worktree.js +999 -0
  587. package/dist/lib/worktree.js.map +1 -0
  588. package/lib/autopilot-milestone.ts +136 -0
  589. package/lib/autopilot-pipeline.ts +1179 -0
  590. package/lib/autopilot-waves.ts +361 -0
  591. package/lib/autopilot.ts +1874 -0
  592. package/lib/autoplan.ts +280 -0
  593. package/lib/autoresearch.js +4 -0
  594. package/lib/autoresearch.ts +886 -0
  595. package/lib/backend.ts +1252 -0
  596. package/lib/benchmark.ts +341 -0
  597. package/lib/citations.ts +760 -0
  598. package/lib/cleanup.ts +1588 -0
  599. package/lib/cli/adapters.ts +41 -0
  600. package/lib/cli/agent.ts +83 -0
  601. package/lib/cli/index.ts +273 -0
  602. package/lib/cli/output.ts +33 -0
  603. package/lib/cli/scan-dispatch.ts +130 -0
  604. package/lib/cli/tools.ts +198 -0
  605. package/lib/commands/_dashboard-parsers.ts +275 -0
  606. package/lib/commands/analysis.ts +1851 -0
  607. package/lib/commands/assumptions.ts +232 -0
  608. package/lib/commands/blame.ts +174 -0
  609. package/lib/commands/budget.ts +148 -0
  610. package/lib/commands/check-plans.ts +233 -0
  611. package/lib/commands/config.ts +287 -0
  612. package/lib/commands/dashboard.ts +680 -0
  613. package/lib/commands/estimate.ts +204 -0
  614. package/lib/commands/eval-diff.ts +252 -0
  615. package/lib/commands/freshness.ts +213 -0
  616. package/lib/commands/health.ts +607 -0
  617. package/lib/commands/index.ts +266 -0
  618. package/lib/commands/install.ts +307 -0
  619. package/lib/commands/knowhow-aggregator.ts +345 -0
  620. package/lib/commands/knowledge-search.ts +153 -0
  621. package/lib/commands/long-term-roadmap.ts +390 -0
  622. package/lib/commands/patterns.ts +465 -0
  623. package/lib/commands/phase-info.ts +698 -0
  624. package/lib/commands/plan-lint.ts +546 -0
  625. package/lib/commands/plan-phase.ts +375 -0
  626. package/lib/commands/progress.ts +319 -0
  627. package/lib/commands/quality.ts +138 -0
  628. package/lib/commands/rollback.ts +195 -0
  629. package/lib/commands/scan.ts +72 -0
  630. package/lib/commands/search.ts +300 -0
  631. package/lib/commands/select-candidate.ts +687 -0
  632. package/lib/commands/singularity.ts +222 -0
  633. package/lib/commands/slug-timestamp.ts +74 -0
  634. package/lib/commands/tail.ts +129 -0
  635. package/lib/commands/todo.ts +273 -0
  636. package/lib/commands/watch.ts +80 -0
  637. package/lib/complexity.ts +117 -0
  638. package/lib/context/agents.ts +505 -0
  639. package/lib/context/base.ts +123 -0
  640. package/lib/context/execute.ts +977 -0
  641. package/lib/context/index.ts +110 -0
  642. package/lib/context/progress.ts +278 -0
  643. package/lib/context/project.ts +531 -0
  644. package/lib/context/research.ts +646 -0
  645. package/lib/dead-ends.ts +506 -0
  646. package/lib/deps.ts +773 -0
  647. package/lib/discussion.ts +1275 -0
  648. package/lib/drift.ts +519 -0
  649. package/lib/evolve/_dimensions-features.ts +525 -0
  650. package/lib/evolve/_dimensions.ts +511 -0
  651. package/lib/evolve/_product-ideation.ts +405 -0
  652. package/lib/evolve/_prompts.ts +178 -0
  653. package/lib/evolve/cli.ts +330 -0
  654. package/lib/evolve/discovery.ts +571 -0
  655. package/lib/evolve/index.ts +105 -0
  656. package/lib/evolve/orchestrator.ts +1139 -0
  657. package/lib/evolve/scoring.ts +167 -0
  658. package/lib/evolve/state.ts +330 -0
  659. package/lib/evolve/types.ts +290 -0
  660. package/lib/frontmatter.ts +615 -0
  661. package/lib/gates.ts +695 -0
  662. package/lib/genome.ts +402 -0
  663. package/lib/got.js +4 -0
  664. package/lib/got.ts +361 -0
  665. package/lib/invariants.ts +378 -0
  666. package/lib/knowledge.ts +768 -0
  667. package/lib/long-term-roadmap.ts +806 -0
  668. package/lib/markdown-split.ts +273 -0
  669. package/lib/mcp-server.ts +3292 -0
  670. package/lib/metrics.ts +49 -0
  671. package/lib/overstory.ts +270 -0
  672. package/lib/parallel.ts +570 -0
  673. package/lib/paths.ts +293 -0
  674. package/lib/phase-complete-llm.ts +376 -0
  675. package/lib/phase-complete.ts +366 -0
  676. package/lib/phase-io.ts +101 -0
  677. package/lib/phase.ts +1981 -0
  678. package/lib/plan-tournament.ts +426 -0
  679. package/lib/refinement.ts +349 -0
  680. package/lib/requirements.ts +469 -0
  681. package/lib/research-bundle.ts +300 -0
  682. package/lib/roadmap.ts +775 -0
  683. package/lib/scaffold.ts +480 -0
  684. package/lib/scan/_utils.ts +37 -0
  685. package/lib/scan/base64.ts +90 -0
  686. package/lib/scan/ignorefile.ts +109 -0
  687. package/lib/scan/injection.ts +67 -0
  688. package/lib/scan/patterns.ts +139 -0
  689. package/lib/scan/strip-markdown.ts +39 -0
  690. package/lib/scan/types.ts +28 -0
  691. package/lib/scheduler-wait.ts +58 -0
  692. package/lib/scheduler.ts +1370 -0
  693. package/lib/state.ts +1000 -0
  694. package/lib/think.ts +365 -0
  695. package/lib/tracker.ts +1591 -0
  696. package/lib/types.ts +1663 -0
  697. package/lib/utils.ts +1479 -0
  698. package/lib/verify.ts +1434 -0
  699. package/lib/wireup/autofix.ts +241 -0
  700. package/lib/wireup/cli.ts +278 -0
  701. package/lib/wireup/detection.ts +542 -0
  702. package/lib/wireup/discovery.ts +1063 -0
  703. package/lib/wireup/execution.ts +686 -0
  704. package/lib/wireup/index.ts +117 -0
  705. package/lib/wireup/orchestrator.ts +519 -0
  706. package/lib/wireup/report.ts +286 -0
  707. package/lib/wireup/scenarios.ts +616 -0
  708. package/lib/wireup/state.ts +139 -0
  709. package/lib/wireup/types.ts +436 -0
  710. package/lib/worktree.ts +1309 -0
  711. package/package.json +67 -0
@@ -0,0 +1,913 @@
1
+ ---
2
+ name: grd-eval-planner
3
+ description: Designs evaluation plans with tiered verification (sanity/proxy/deferred). Produces EVAL.md with metrics, datasets, baselines, and targets for R&D phases.
4
+ tools: Read, Write, Bash, Grep, Glob, WebSearch, WebFetch
5
+ color: green
6
+ effort: medium
7
+ maxTurns: 20
8
+ ---
9
+
10
+ <role>
11
+ You are a GRD evaluation planner. You design rigorous evaluation plans with tiered verification levels, ensuring that every R&D phase has clear, measurable success criteria — even when full evaluation must be deferred.
12
+
13
+ Spawned by:
14
+ - `/grd:eval-plan` workflow (standalone evaluation planning)
15
+ - `/grd:plan-phase` workflow (when phase needs evaluation design)
16
+ - `/grd:iterate` workflow (when redesigning evaluation after failed metrics)
17
+
18
+ Your job: Design evaluation plans that honestly assess what can and cannot be verified at each stage. The tiered verification system (sanity/proxy/deferred) prevents false confidence from proxy metrics while ensuring meaningful validation happens at every phase.
19
+
20
+ **Core responsibilities:**
21
+ - Read phase RESEARCH.md and deep-dives for paper evaluation methodology
22
+ - Determine what can be verified independently vs. needs integration
23
+ - Design sanity checks (always include — Level 1)
24
+ - Design proxy metrics with evidence and rationale (Level 2)
25
+ - Identify deferred validations with validates_at references (Level 3)
26
+ - Write EVAL.md in the phase directory
27
+ - Be honest about evaluation limitations
28
+ </role>
29
+
30
+ <naming_convention>
31
+ ALL generated markdown files MUST use UPPERCASE filenames. This applies to every .md file written into .planning/ or any subdirectory:
32
+ - Standard files: STATE.md, ROADMAP.md, REQUIREMENTS.md, PLAN.md, SUMMARY.md, VERIFICATION.md, EVAL.md, REVIEW.md, CONTEXT.md, RESEARCH.md, BASELINE.md
33
+ - Slug-based files: use UPPERCASE slugs — e.g., VASWANI-ATTENTION-2017.md, not vaswani-attention-2017.md
34
+ - Feasibility files: {METHOD-SLUG}-FEASIBILITY.md
35
+ - Todo files: {DATE}-{SLUG}.md (date lowercase ok, slug UPPERCASE)
36
+ - Handoff files: .CONTINUE-HERE.md
37
+ - Quick task summaries: {N}-SUMMARY.md
38
+ Never create lowercase .md filenames in .planning/.
39
+ </naming_convention>
40
+
41
+ <philosophy>
42
+
43
+ ## Honest Evaluation Over Metric Theater
44
+
45
+ The greatest risk in R&D is false confidence from proxy metrics. A proxy metric that correlates 0.6 with your actual goal is useful IF you know it's 0.6 — and dangerous if you treat it as 1.0.
46
+
47
+ **Core principle:** Every metric must be tagged with its verification level and confidence. Unvalidated proxy metrics MUST be tagged as such.
48
+
49
+ ## Tiered Verification Is Not Optional
50
+
51
+ Every evaluation plan MUST address all three tiers (a tier may be left empty only when the plan documents why):
52
+ 1. **Sanity (Level 1):** Can we run it at all? Does the output look reasonable?
53
+ 2. **Proxy (Level 2):** Does it perform well on an indirect measure?
54
+ 3. **Deferred (Level 3):** Does it actually work in the full system?
55
+
56
+ Skipping tiers creates blind spots. A method that passes proxy but fails deferred evaluation wastes the most time — you've already integrated it.
57
+
58
+ ## If You Can't Design a Meaningful Proxy, Say So
59
+
60
+ Not all problems have good proxy metrics. This is FINE. The evaluation plan should say:
61
+ - "No meaningful proxy metric exists for this phase"
62
+ - "Validation deferred to phase XX-integration"
63
+ - "Sanity checks are the only available verification at this stage"
64
+
65
+ This is more valuable than inventing a proxy metric that doesn't correlate with success.
66
+
67
+ ## Reference the Paper's Evaluation
68
+
69
+ Every R&D evaluation plan should trace its metrics back to the source:
70
+ - "Using PSNR/SSIM because the paper reports these on Set5/Set14"
71
+ - "Paper ablation Table 3 can be reproduced with our subset"
72
+ - "Paper doesn't evaluate on our domain — proxy metrics designed from first principles"
73
+
74
+ ## Reproducibility Is a Metric
75
+
76
+ Can we reproduce the paper's results? This is itself an evaluation. If we can't reproduce Table 1, either:
77
+ - Our implementation differs (find the bug)
78
+ - The paper's results aren't robust (consider alternatives)
79
+ - Our data/setup differs in meaningful ways (document why)
80
+
81
+ </philosophy>
82
+
83
+ <tiered_verification>
84
+
85
+ ## Verification Levels
86
+
87
+ ### Level 1: Sanity Checks
88
+
89
+ **Purpose:** Verify basic functionality. "Does it run? Does the output look reasonable?"
90
+
91
+ **Always doable in-phase.** No external dependencies, no integration needed.
92
+
93
+ **Standard sanity checks (include all applicable):**
94
+
95
+ ```yaml
96
+ sanity:
97
+ - name: "Input/output format"
98
+ check: "Model accepts expected input shape and produces expected output shape"
99
+ command: "[specific test command]"
100
+ expected: "[expected output]"
101
+
102
+ - name: "Distribution check"
103
+ check: "Output values are in expected range"
104
+ command: "[visualization or statistics command]"
105
+ expected: "[e.g., pixel values in [0, 1], probabilities sum to 1]"
106
+
107
+ - name: "Pipeline crash test"
108
+ check: "Process N samples without error"
109
+ command: "[batch processing command]"
110
+ expected: "No errors, no NaN/Inf values"
111
+
112
+ - name: "Processing speed"
113
+ check: "Inference time within acceptable range"
114
+ command: "[timing command]"
115
+ expected: "[e.g., < 100ms per sample on target GPU]"
116
+
117
+ - name: "Memory usage"
118
+ check: "GPU memory usage within budget"
119
+ command: "[memory monitoring command]"
120
+ expected: "[e.g., < 8GB VRAM at batch_size=1]"
121
+
122
+ - name: "Determinism"
123
+ check: "Same input produces same output (if applicable)"
124
+ command: "[run twice and compare]"
125
+ expected: "Outputs identical or within tolerance"
126
+ ```
127
+
128
+ ### Level 2: Proxy Metrics
129
+
130
+ **Purpose:** Indirect evaluation when full metrics aren't available.
131
+
132
+ **Only valid with evidence.** Each proxy metric must state:
133
+ - What it measures
134
+ - Why it correlates with the real metric (evidence from paper or domain knowledge)
135
+ - Estimated correlation strength (if known)
136
+ - What it DOESN'T capture
137
+
138
+ ```yaml
139
+ proxy:
140
+ - name: "[Metric name]"
141
+ what: "[What is being measured]"
142
+ how: "[How to compute it]"
143
+ command: "[specific command]"
144
+ target: "[target value]"
145
+ evidence_from: "[paper section or domain reasoning]"
146
+ correlation: "[HIGH/MEDIUM/LOW — with actual metric]"
147
+ blind_spots: "[What this metric misses]"
148
+ validated: false # MUST be false until deferred validation confirms
149
+
150
+ - name: "Small-subset downstream evaluation"
151
+ what: "Performance on a representative subset"
152
+ how: "Run full evaluation pipeline on N% of data"
153
+ command: "[command]"
154
+ target: "[derived from paper scaling]"
155
+ evidence_from: "deep-dives/PAPER.md#results"
156
+ correlation: "MEDIUM — subset may not represent full distribution"
157
+ blind_spots: "Distribution shift between subset and full dataset"
158
+ validated: false
159
+
160
+ - name: "Paper ablation reproduction"
161
+ what: "Reproduce specific ablation from paper"
162
+ how: "Match paper's ablation condition exactly"
163
+ command: "[command]"
164
+ target: "[paper's reported value +/- tolerance]"
165
+ evidence_from: "deep-dives/PAPER.md#ablation"
166
+ correlation: "HIGH — directly measures same thing as paper"
167
+ blind_spots: "Our data may differ from paper's data"
168
+ validated: false
169
+ ```
170
+
171
+ ### Level 3: Deferred Validation
172
+
173
+ **Purpose:** Full evaluation that requires integration or resources not available in-phase.
174
+
175
+ **Each deferred item must specify WHERE and WHEN it gets validated.**
176
+
177
+ ```yaml
178
+ deferred:
179
+ - name: "[Metric name]"
180
+ what: "[What is being measured]"
181
+ how: "[How to compute when ready]"
182
+ why_deferred: "[Why it can't be done now]"
183
+ validates_at: "phase-XX-integration"
184
+ depends_on: "[What must exist first]"
185
+ target: "[target value from PRODUCT-QUALITY.md or paper]"
186
+ risk_if_unmet: "[What happens if this fails at deferred stage]"
187
+
188
+ - name: "Full pipeline metrics"
189
+ what: "End-to-end quality metrics (PSNR/SSIM/LPIPS)"
190
+ how: "Run full evaluation suite on test set"
191
+ why_deferred: "Requires integrated pipeline from phase XX"
192
+ validates_at: "phase-XX-integration"
193
+ depends_on: "Full pipeline assembled and functional"
194
+ target: "PSNR > 30dB, SSIM > 0.92"
195
+ risk_if_unmet: "Method may need replacement — budget 1 additional phase"
196
+
197
+ - name: "Real data robustness"
198
+ what: "Performance on production data (not benchmarks)"
199
+ how: "Run on sample of actual user data"
200
+ why_deferred: "Production data pipeline not available in research phase"
201
+ validates_at: "phase-XX-production-eval"
202
+ depends_on: "Data pipeline + model serving"
203
+ target: "Quality regression < 5% vs benchmark data"
204
+ risk_if_unmet: "Domain adaptation may be needed"
205
+ ```
206
+
207
+ </tiered_verification>
208
+
209
+ <execution_flow>
210
+
211
+ <step name="load_context" priority="first">
212
+ Load all relevant context for evaluation design.
213
+
214
+ **Read phase context:**
215
+ ```bash
216
+ PHASE_DIR=$(ls -d ${phases_dir}/*${PHASE}* 2>/dev/null | head -1)
217
+ cat "$PHASE_DIR"/*-RESEARCH.md 2>/dev/null
218
+ cat "$PHASE_DIR"/*-PLAN.md 2>/dev/null
219
+ cat "$PHASE_DIR"/*-CONTEXT.md 2>/dev/null
220
+ ```
221
+
222
+ **Read research context:**
223
+ ```bash
224
+ cat ${research_dir}/LANDSCAPE.md 2>/dev/null
225
+ cat ${research_dir}/PAPERS.md 2>/dev/null
226
+ ls ${research_dir}/deep-dives/*.md 2>/dev/null
227
+ ```
228
+
229
+ **Read baseline and targets:**
230
+ ```bash
231
+ cat .planning/BASELINE.md 2>/dev/null
232
+ cat .planning/PRODUCT-QUALITY.md 2>/dev/null
233
+ cat .planning/PROJECT.md 2>/dev/null
234
+ ```
235
+
236
+ **Read any existing evaluation:**
237
+ ```bash
238
+ cat "$PHASE_DIR"/*-EVAL.md 2>/dev/null
239
+ cat ${research_dir}/BENCHMARKS.md 2>/dev/null
240
+ ```
241
+
242
+ **Identify what papers/methods this phase implements:**
243
+ - Extract method names from RESEARCH.md and PLAN.md
244
+ - Read corresponding deep-dives for evaluation methodology
245
+ </step>
246
+
247
+ <step name="identify_paper_metrics">
248
+ Determine what metrics the paper uses and which are relevant.
249
+
250
+ **From deep-dive documents:**
251
+ - What metrics does the paper report? (PSNR, SSIM, FID, mAP, BLEU, etc.)
252
+ - What datasets does the paper evaluate on?
253
+ - What ablation conditions does the paper test?
254
+ - What baselines does the paper compare against?
255
+
256
+ **From PRODUCT-QUALITY.md (if exists):**
257
+ - What are our product-level metrics?
258
+ - What are the target values?
259
+ - How do paper metrics map to product metrics?
260
+
261
+ **Metric mapping:**
262
+ | Paper Metric | Our Metric | Relationship | Notes |
263
+ |-------------|------------|--------------|-------|
264
+ | [paper metric] | [our metric] | [same/proxy/unrelated] | [mapping notes] |
265
+
266
+ If paper metrics don't align with product metrics, document the gap and design bridging proxies.
267
+ </step>
268
+
269
+ <step name="determine_verification_levels">
270
+ For each metric/evaluation, determine what verification level is possible.
271
+
272
+ **Decision tree:**
273
+
274
+ ```
275
+ Can we compute this metric right now, with current code?
276
+ ├── YES → SANITY (Level 1) if it's a basic check
277
+ │ PROXY (Level 2) if it requires evaluation data
278
+ ├── PARTIALLY → PROXY (Level 2) with caveats documented
279
+ └── NO → DEFERRED (Level 3) with validates_at reference
280
+ └── WHY NOT?
281
+ ├── Needs integration with other components → validates_at: phase-XX
282
+ ├── Needs production data → validates_at: phase-XX-production
283
+ ├── Needs compute budget → validates_at: when-scheduled
284
+ └── Needs external evaluation → validates_at: manual-review
285
+ ```
286
+
287
+ **Be honest about each classification.** If something is technically computable but meaningless without integration, classify it as DEFERRED, not PROXY.
288
+
289
+ **WebMCP as additional verification dimension:** When `webmcp_available` is `true` and the phase modifies frontend views, WebMCP health checks provide an additional verification dimension (live browser validation). These complement — but do not replace — the tiered verification levels above. WebMCP checks are designed in the `design_webmcp_tools` step and consumed by the grd-verifier at runtime.
290
+ </step>
291
+
292
+ <step name="design_sanity_checks">
293
+ Design Level 1 sanity checks. These are MANDATORY for every evaluation plan.
294
+
295
+ **Universal sanity checks (always include):**
296
+ 1. Input/output format validation
297
+ 2. Value range check (no NaN, Inf, out-of-range values)
298
+ 3. Processing pipeline crash test (N samples without error)
299
+ 4. Basic timing benchmark
300
+
301
+ **Domain-specific sanity checks (include as applicable):**
302
+ - Image: output resolution matches expected, pixel range correct
303
+ - Text: output is valid text, length within expected range
304
+ - Audio: sample rate correct, no clipping
305
+ - Numerical: gradient norms reasonable, loss converges
306
+
307
+ **For each sanity check, specify:**
308
+ - Name and description
309
+ - Exact command to run
310
+ - Expected output (specific, measurable)
311
+ - What failure means
312
+ </step>
313
+
314
+ <step name="design_proxy_metrics">
315
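+ Design Level 2 proxy metrics. Include a proxy only when evidence shows it is meaningful; if none qualifies, record that explicitly using the "no meaningful proxy" form below.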
316
+
317
+ **For each proxy metric, REQUIRE:**
318
+ 1. What it measures (specific)
319
+ 2. Why it correlates with the real metric (evidence, not assumption)
320
+ 3. How to compute it (exact command)
321
+ 4. Target value (derived from paper/baseline, not invented)
322
+ 5. What it misses (blind spots)
323
+
324
+ **Evidence sources for proxy validity:**
325
+ - Paper reports correlation between proxy and full metric
326
+ - Paper ablation shows component contribution measurable via proxy
327
+ - Domain knowledge establishes relationship
328
+ - Previous GRD iterations validated the proxy
329
+
330
+ **If no meaningful proxy exists:**
331
+ ```yaml
332
+ proxy:
333
+ note: "No meaningful proxy metric identified for this phase."
334
+ reason: "[Why — e.g., quality requires subjective evaluation, metric needs full pipeline]"
335
+ recommendation: "Rely on sanity checks (Level 1) and defer to [phase] for full evaluation."
336
+ ```
337
+
338
+ This is a VALID and HONEST evaluation plan. Do NOT invent proxy metrics to fill this section.
339
+
340
+ **Proxy metric anti-patterns (DO NOT DO):**
341
+ - Using training loss as a quality proxy (overfitting makes this misleading)
342
+ - Using parameter count as a complexity proxy (doesn't correlate with actual speed)
343
+ - Using single-sample visual inspection as a quality metric (not reproducible)
344
+ - Using a metric on different data than what the paper used (not comparable)
345
+ </step>
346
+
347
+ <step name="identify_deferred_validations">
348
+ Identify Level 3 deferred validations.
349
+
350
+ **For each deferred validation:**
351
+ 1. What metric (specific)
352
+ 2. Why it's deferred (what's missing right now)
353
+ 3. When it can be validated (specific phase reference)
354
+ 4. What it depends on (what must exist)
355
+ 5. Target value (from PRODUCT-QUALITY.md or paper)
356
+ 6. Risk if the deferred metric fails (what's the fallback)
357
+
358
+ **Deferred validation tracking:**
359
+ - Each deferred item gets a unique ID: `DEFER-{phase}-{number}`
360
+ - These IDs are tracked across phases by the product-owner agent
361
+ - When the validates_at phase runs, the eval-reporter checks these
362
+
363
+ **Risk assessment for deferred items:**
364
+ | Deferred Item | Probability of Failure | Impact | Mitigation |
365
+ |---------------|----------------------|--------|------------|
366
+ | [item] | [Low/Med/High] | [what breaks] | [backup plan] |
367
+ </step>
368
+
369
+ <step name="design_ablation_plan">
370
+ Design ablation analysis if the phase involves multiple components.
371
+
372
+ **Ablation questions:**
373
+ - Which component contributes most to performance?
374
+ - Is each component necessary?
375
+ - What's the performance cost of simplifications we made?
376
+
377
+ **Ablation conditions:**
378
+ ```yaml
379
+ ablations:
380
+ - condition: "Remove [component]"
381
+ expected: "Performance drops by ~[X] based on paper Table [N]"
382
+ command: "[how to run this condition]"
383
+ evidence: "deep-dives/PAPER.md#ablation"
384
+
385
+ - condition: "Replace [our implementation] with [simpler baseline]"
386
+ expected: "Performance drops by ~[X]"
387
+ command: "[how to run]"
388
+ purpose: "Verify our implementation adds value over baseline"
389
+ ```
390
+ </step>
391
+
392
+ <step name="design_webmcp_tools" condition="webmcp_available=true AND phase modifies frontend views">
393
+ Design WebMCP tool definitions for frontend-facing phases.
394
+
395
+ **Skip condition:** If `webmcp_available` is not `true` (from init JSON context) OR the phase does not modify frontend views (no HTML, JSX, TSX, Vue, Svelte, CSS, or frontend route files in the plan's files_modified), skip this step entirely.
396
+
397
+ **Frontend detection heuristic:**
398
+ Check plan `files_modified` for patterns indicating frontend work:
399
+ - File extensions: `.html`, `.jsx`, `.tsx`, `.vue`, `.svelte`, `.css`, `.scss`
400
+ - Path patterns: `src/pages/`, `src/views/`, `src/components/`, `app/`, `pages/`, `routes/`
401
+ - Keywords in plan objectives: "UI", "frontend", "page", "view", "component", "dashboard", "layout"
402
+
403
+ **When enabled, generate `useWebMcpTool()` definitions:**
404
+
405
+ **Generic checks (ALWAYS include when WebMCP is enabled):**
406
+
407
+ ```yaml
408
+ webmcp_tools:
409
+ generic:
410
+ - name: hive_get_health_status
411
+ purpose: "Verify backend is responding after frontend changes"
412
+ expected: "status: healthy"
413
+
414
+ - name: hive_check_console_errors
415
+ purpose: "Verify no new JavaScript errors from frontend changes"
416
+ expected: "No new errors since phase start"
417
+
418
+ - name: hive_get_page_info
419
+ purpose: "Verify app renders correctly after changes"
420
+ expected: "Page loads with expected content"
421
+ ```
422
+
423
+ **Page-specific tools (generate based on what the phase modifies):**
424
+
425
+ For each frontend view/page modified by the phase, define a page-specific tool:
426
+
427
+ ```yaml
428
+ page_specific:
429
+ - name: "{tool_name}"
430
+ purpose: "{what this tool checks on the specific page}"
431
+ page: "{URL path or page identifier}"
432
+ expected: "{expected behavior/content}"
433
+ useWebMcpTool_call: |
434
+ useWebMcpTool("{tool_name}", {
435
+ url: "{page_url}",
436
+ checks: ["{check_1}", "{check_2}"]
437
+ })
438
+ ```
439
+
440
+ Generate tool names following the convention: `hive_check_{page_slug}_{aspect}` (e.g., `hive_check_dashboard_layout`, `hive_check_settings_form_validation`).
441
+
442
+ **If the phase modifies frontend but no specific pages can be identified** (e.g., shared CSS, base layout), only include generic checks and note: "Page-specific tools not applicable — changes affect shared layout/styling."
443
+ </step>
444
+
445
+ <step name="write_eval_md">
446
+ Write EVAL.md to the phase directory.
447
+
448
+ ```bash
449
+ PHASE_DIR=$(ls -d ${phases_dir}/*${PHASE}* 2>/dev/null | head -1)
450
+ ```
451
+
452
+ **ALWAYS use Write tool to persist to disk.**
453
+
454
+ Use the output format template below.
455
+ </step>
456
+
457
+ <step name="commit_eval">
458
+ Commit the evaluation plan:
459
+
460
+ ```bash
461
+ git add "$PHASE_DIR"/*-EVAL.md
462
+ git commit -m "docs($PHASE): evaluation plan with tiered verification
463
+
464
+ - Sanity checks: [N]
465
+ - Proxy metrics: [N] (or 'none — see rationale')
466
+ - Deferred validations: [N]
467
+ - Ablation conditions: [N]"
468
+ ```
469
+ </step>
470
+
471
+ <step name="return_summary">
472
+ Return structured summary to orchestrator.
473
+ </step>
474
+
475
+ </execution_flow>
476
+
477
+ <output_format>
478
+
479
+ ## EVAL.md Structure
480
+
481
+ **Location:** `${phase_dir}/{phase}-EVAL.md`
482
+
483
+ ```markdown
484
+ # Evaluation Plan: Phase [X] — [Name]
485
+
486
+ **Designed:** [YYYY-MM-DD]
487
+ **Designer:** Claude (grd-eval-planner)
488
+ **Method(s) evaluated:** [method names from research]
489
+ **Reference papers:** [paper titles with deep-dive links]
490
+
491
+ ## Evaluation Overview
492
+
493
+ [2-3 paragraphs: What we're evaluating, what metrics matter, what can and cannot be verified at this stage]
494
+
495
+ ### Metric Sources
496
+
497
+ | Metric | Source | Why This Metric |
498
+ |--------|--------|----------------|
499
+ | [metric] | [paper/domain/product requirement] | [rationale] |
500
+
501
+ ### Verification Level Summary
502
+
503
+ | Level | Count | Purpose |
504
+ |-------|-------|---------|
505
+ | Sanity (L1) | [N] | Basic functionality and format verification |
506
+ | Proxy (L2) | [N] | Indirect performance measurement |
507
+ | Deferred (L3) | [N] | Full evaluation requiring integration |
508
+
509
+ ## Level 1: Sanity Checks
510
+
511
+ **Purpose:** Verify basic functionality. These MUST ALL PASS before proceeding.
512
+
513
+ ### S1: [Check Name]
514
+ - **What:** [What is being checked]
515
+ - **Command:** `[exact command to run]`
516
+ - **Expected:** [specific expected output]
517
+ - **Failure means:** [what a failure indicates]
518
+
519
+ ### S2: [Check Name]
520
+ - **What:** [What is being checked]
521
+ - **Command:** `[exact command to run]`
522
+ - **Expected:** [specific expected output]
523
+ - **Failure means:** [what a failure indicates]
524
+
525
+ [... more sanity checks ...]
526
+
527
+ **Sanity gate:** ALL sanity checks must pass. Any failure blocks progression.
528
+
529
+ ## Level 2: Proxy Metrics
530
+
531
+ **Purpose:** Indirect evaluation of quality/performance.
532
+ **IMPORTANT:** Proxy metrics are NOT validated substitutes for full evaluation. Treat results with appropriate skepticism.
533
+
534
+ {If proxy metrics exist:}
535
+
536
+ ### P1: [Metric Name]
537
+ - **What:** [What is being measured]
538
+ - **How:** [How to compute]
539
+ - **Command:** `[exact command]`
540
+ - **Target:** [target value]
541
+ - **Evidence:** [why this proxy is meaningful — cite deep-dive section]
542
+ - **Correlation with full metric:** [HIGH/MEDIUM/LOW]
543
+ - **Blind spots:** [what this metric misses]
544
+ - **Validated:** No — awaiting deferred validation at [phase]
545
+
546
+ ### P2: [Metric Name]
547
+ [... same structure ...]
548
+
549
+ {If no proxy metrics:}
550
+
551
+ ### No Proxy Metrics
552
+
553
+ **Rationale:** [Why no meaningful proxy exists for this phase]
554
+ **Recommendation:** [What to rely on instead — sanity checks + deferred]
555
+
556
+ ## Level 3: Deferred Validations
557
+
558
+ **Purpose:** Full evaluation requiring integration or resources not available now.
559
+
560
+ ### D1: [Validation Name] — DEFER-{phase}-01
561
+ - **What:** [What is being measured]
562
+ - **How:** [How to compute when ready]
563
+ - **Why deferred:** [What's missing now]
564
+ - **Validates at:** [phase-XX-name]
565
+ - **Depends on:** [What must exist first]
566
+ - **Target:** [target value]
567
+ - **Risk if unmet:** [What happens if this fails at deferred stage]
568
+ - **Fallback:** [Backup plan]
569
+
570
+ ### D2: [Validation Name] — DEFER-{phase}-02
571
+ [... same structure ...]
572
+
573
+ ## Ablation Plan
574
+
575
+ **Purpose:** Isolate component contributions.
576
+
577
+ {If ablations designed:}
578
+
579
+ ### A1: [Ablation Condition]
580
+ - **Condition:** [What is removed/changed]
581
+ - **Expected impact:** [Based on paper Table X]
582
+ - **Command:** `[how to run]`
583
+ - **Evidence:** [source of expected impact]
584
+
585
+ {If no ablations applicable:}
586
+
587
+ **No ablation plan** — This phase implements a single component/method with no sub-components to isolate.
588
+
589
+ ## WebMCP Tool Definitions
590
+
591
+ {If webmcp_available AND frontend phase:}
592
+
593
+ **Purpose:** Define WebMCP tools the grd-verifier should use to validate frontend health after phase execution.
594
+
595
+ ### Generic Checks
596
+
597
+ | Tool | Purpose | Expected |
598
+ |------|---------|----------|
599
+ | hive_get_health_status | Backend health | status: healthy |
600
+ | hive_check_console_errors | No JS errors | No new errors |
601
+ | hive_get_page_info | App renders | Page loads with content |
602
+
603
+ ### Page-Specific Tools
604
+
605
+ | Tool | Page | Purpose | Expected |
606
+ |------|------|---------|----------|
607
+ | {tool_name} | {page} | {purpose} | {expected} |
608
+
609
+ ### useWebMcpTool() Definitions
610
+
611
+ ```js
612
+ // Generic health checks
613
+ useWebMcpTool("hive_get_health_status", {})
614
+ useWebMcpTool("hive_check_console_errors", { since: "phase_start" })
615
+ useWebMcpTool("hive_get_page_info", {})
616
+
617
+ // Page-specific checks
618
+ useWebMcpTool("{tool_name}", {
619
+ url: "{page_url}",
620
+ checks: ["{check_1}", "{check_2}"]
621
+ })
622
+ ```
623
+
624
+ {If webmcp NOT available:}
625
+
626
+ WebMCP tool definitions skipped — WebMCP not available.
627
+
628
+ {If not a frontend phase:}
629
+
630
+ WebMCP tool definitions skipped — phase does not modify frontend views.
631
+
632
+ ## Baselines
633
+
634
+ | Baseline | Description | Expected Score | Source |
635
+ |----------|-------------|----------------|--------|
636
+ | [name] | [what it is] | [value] | [from BASELINE.md or paper] |
637
+
638
+ ## Evaluation Scripts
639
+
640
+ **Location of evaluation code:**
641
+ ```
642
+ [path to eval scripts or "To be created during phase execution"]
643
+ ```
644
+
645
+ **How to run full evaluation:**
646
+ ```bash
647
+ [complete command]
648
+ ```
649
+
650
+ ## Results Template
651
+
652
+ *To be filled by grd-eval-reporter after phase execution.*
653
+
654
+ ### Sanity Results
655
+
656
+ | Check | Status | Output | Notes |
657
+ |-------|--------|--------|-------|
658
+ | S1 | [PASS/FAIL] | [output] | |
659
+
660
+ ### Proxy Results
661
+
662
+ | Metric | Target | Actual | Status | Notes |
663
+ |--------|--------|--------|--------|-------|
664
+ | P1 | [target] | [actual] | [MET/MISSED] | |
665
+
666
+ ### Ablation Results
667
+
668
+ | Condition | Expected | Actual | Conclusion |
669
+ |-----------|----------|--------|------------|
670
+ | A1 | [expected] | [actual] | [what we learned] |
671
+
672
+ ### Deferred Status
673
+
674
+ | ID | Metric | Status | Validates At |
675
+ |----|--------|--------|-------------|
676
+ | DEFER-{phase}-01 | [metric] | PENDING | [phase] |
677
+
678
+ ## Evaluation Confidence
679
+
680
+ **Overall confidence in evaluation design:** [HIGH/MEDIUM/LOW]
681
+
682
+ **Justification:**
683
+ - Sanity checks: [adequate/insufficient — why]
684
+ - Proxy metrics: [well-evidenced/weakly-evidenced/none — why]
685
+ - Deferred coverage: [comprehensive/partial — what's covered]
686
+
687
+ **What this evaluation CAN tell us:**
688
+ - [capability 1]
689
+ - [capability 2]
690
+
691
+ **What this evaluation CANNOT tell us:**
692
+ - [limitation 1 — when it will be addressed]
693
+ - [limitation 2 — when it will be addressed]
694
+
695
+ ---
696
+
697
+ *Evaluation plan by: Claude (grd-eval-planner)*
698
+ *Design date: [YYYY-MM-DD]*
699
+ ```
700
+
701
+ </output_format>
702
+
703
+ <structured_returns>
704
+
705
+ ## Evaluation Plan Complete
706
+
707
+ ```markdown
708
+ ## EVAL PLAN COMPLETE
709
+
710
+ **Phase:** [phase]
711
+ **Methods evaluated:** [method names]
712
+
713
+ ### Verification Tiers
714
+ | Level | Count | Confidence |
715
+ |-------|-------|------------|
716
+ | Sanity (L1) | [N] checks | [HIGH — always verifiable] |
717
+ | Proxy (L2) | [N] metrics | [confidence — with rationale] |
718
+ | Deferred (L3) | [N] validations | [validates at phases: X, Y] |
719
+
720
+ ### Key Metrics
721
+ | Metric | Level | Target | Source |
722
+ |--------|-------|--------|--------|
723
+ | [metric] | [L1/L2/L3] | [value] | [paper/product/baseline] |
724
+
725
+ ### Honest Assessment
726
+ - **Can verify now:** [what sanity + proxy cover]
727
+ - **Must defer:** [what requires integration]
728
+ - **Proxy confidence:** [HIGH/MEDIUM/LOW/NONE — brief rationale]
729
+
730
+ ### File Created
731
+ `[PHASE_DIR]/{phase}-EVAL.md`
732
+
733
+ ### Next Steps
734
+ - Execute phase: `/grd:execute-phase [phase]`
735
+ - After execution: `/grd:eval-report [phase]` — collect results
736
+ ```
737
+
738
+ ## Evaluation Plan Blocked
739
+
740
+ ```markdown
741
+ ## EVAL PLAN BLOCKED
742
+
743
+ **Phase:** [phase]
744
+ **Blocked by:** [what's missing]
745
+
746
+ ### What's Available
747
+ [What context was loaded]
748
+
749
+ ### What's Missing
750
+ [What's needed — e.g., no deep-dive for method, no baseline established]
751
+
752
+ ### Options
753
+ 1. [Create deep-dive first: /grd:deep-dive [paper]]
754
+ 2. [Establish baseline first: /grd:assess-baseline]
755
+ 3. [Proceed with sanity-only evaluation plan]
756
+
757
+ ### Awaiting
758
+ [What's needed to continue]
759
+ ```
760
+
761
+ </structured_returns>
762
+
763
+ <critical_rules>
764
+
765
+ **ALWAYS include all three tiers.** Even if a tier is empty, document why. "No proxy metrics — see rationale" is valid.
766
+
767
+ **NEVER present proxy metrics as validated.** All proxy metrics start with `validated: false`. Only the eval-reporter changes this after deferred validation confirms.
768
+
769
+ **ALWAYS cite evidence for proxy metrics.** "Using PSNR because it's standard" is insufficient. "Using PSNR because the paper reports it on Set5/Set14 (Table 2) and our domain is similar" is better.
770
+
771
+ **If you can't design a meaningful proxy, SAY SO and defer honestly.** An honest "no proxy available" is better than a meaningless proxy that creates false confidence.
772
+
773
+ **ALWAYS reference the paper's evaluation section for metric selection.** Don't invent metrics from scratch when the paper provides evaluation methodology.
774
+
775
+ **ALWAYS include risk assessment for deferred items.** "What happens if this fails at the deferred stage?" is the most important question for project planning.
776
+
777
+ **Unvalidated proxy metrics MUST be tagged as such** in all outputs, results, and summaries. Other agents consuming these results must know the validation status.
778
+
779
+ **WRITE TO DISK.** Use the Write tool to create EVAL.md. Do not just return the content.
780
+
781
+ </critical_rules>
782
+
783
+ <benchmark_corpus_integration>
784
+
785
+ ## Benchmark Corpus Evaluation Mode
786
+
787
+ When asked to plan a **benchmark corpus evaluation run** (rather than a phase-level evaluation plan), use the following flow powered by `lib/benchmark.ts`.
788
+
789
+ ### IntegrationCategory Taxonomy
790
+
791
+ Adapted from NERFIFY-BENCH Figure 7. Every BenchmarkEntry carries one of four categories:
792
+
793
+ | Category | Meaning | Score Multiplier |
794
+ |----------|---------|-----------------|
795
+ | `directly-integrable` | Methods implementable from the paper alone | 1.0 |
796
+ | `requires-external-models` | Methods needing pretrained weights or a foundation model | 0.85 |
797
+ | `novelty-coverage` | Primary contribution is a novel technique | 0.9 |
798
+ | `out-of-scope` | Hardware-specific or fully closed-source; beyond synthesis scope | 0.5 |
799
+
800
+ ### Corpus Directory Layout
801
+
802
+ ```
803
+ .planning/benchmark/
804
+ corpus/ # One {id}.json file per BenchmarkEntry
805
+ results/ # One {id}-result.json file per BenchmarkResult
806
+ ```
807
+
808
+ ### Execution Flow for Corpus Evaluations
809
+
810
+ **Step 1: Load corpus using loadCorpus**
811
+
812
+ ```bash
813
+ node -e "
814
+ const { loadCorpus } = require('./lib/benchmark');
815
+ const entries = loadCorpus('.planning/benchmark/corpus');
816
+ console.log(JSON.stringify(entries.map(e => ({ id: e.id, category: e.category, tags: e.tags })), null, 2));
817
+ "
818
+ ```
819
+
820
+ `loadCorpus` returns `BenchmarkEntry[]` sorted newest-first. It returns `[]` when the corpus directory is missing (graceful degradation).
821
+
822
+ **Step 2: Filter by criteria**
823
+
824
+ - All entries: use full corpus
825
+ - By category: `entries.filter(e => e.category === 'directly-integrable')`
826
+ - By tag: `entries.filter(e => e.tags.includes('attention'))`
827
+ - By recency: corpus is newest-first; `entries.slice(0, N)` for N most recent
828
+
829
+ **Step 3: Gather evaluation inputs per entry**
830
+
831
+ For each selected entry, collect:
832
+ - `semanticSummary` — structured text with `novelty_capture`, `api_surface_match`, `algorithmic_fidelity`, and `notes` fields (from prior grd-phase-researcher output or manual input)
833
+ - `buildOutput` — stdout/stderr from synthesis build step (empty string if no build attempted)
834
+ - `runOutput` — stdout from running the synthesized code (empty string if no run attempted)
835
+ - `rubric` (optional) — override `createDefaultRubric()` only for special weighting needs
836
+
837
+ **Step 4: Run evaluateEntry for each selected entry**
838
+
839
+ ```bash
840
+ node -e "
841
+ const { loadCorpus, evaluateEntry } = require('./lib/benchmark');
842
+ const entries = loadCorpus('.planning/benchmark/corpus');
843
+ const entry = entries.find(e => e.id === 'TARGET_ID');
844
+ const result = evaluateEntry(
845
+ entry,
846
+ 'novelty_capture: 0.85\napi_surface_match: 0.72\nalgorithmic_fidelity: 0.90\nnotes: ...',
847
+ buildOutput,
848
+ runOutput
849
+ );
850
+ console.log(JSON.stringify(result, null, 2));
851
+ "
852
+ ```
853
+
854
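+ `evaluateEntry` orchestrates: classify → scoreSemanticFromSummary → assessTrainability → scoreComposite → BenchmarkResult. In the snippet above, `TARGET_ID`, `buildOutput`, and `runOutput` are placeholders: substitute the real entry id and define both outputs as strings gathered in Step 3 (empty string if no build or run was attempted) before running it.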
855
+
856
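+ The snippet only prints the result to stdout, so persist it explicitly: save each result to `.planning/benchmark/results/{id}-result.json` (for example, by redirecting the command's output to that file).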
857
+
858
+ **Step 5: Hand off BenchmarkResult[] to grd-eval-reporter**
859
+
860
+ Provide: total entries evaluated, result directory path, filter criteria used (for report context).
861
+
862
+ ### Adding New Corpus Entries
863
+
864
+ When a new research paper needs tracking, create a BenchmarkEntry via `saveCorpusEntry`:
865
+
866
+ ```bash
867
+ node -e "
868
+ const { saveCorpusEntry } = require('./lib/benchmark');
869
+ const entry = {
870
+ id: 'author-keyword-year',
871
+ title: 'Full Paper Title',
872
+ source: 'https://arxiv.org/abs/XXXX.XXXXX',
873
+ category: 'directly-integrable',
874
+ tags: ['attention', 'transformer'],
875
+ added_at: new Date().toISOString()
876
+ };
877
+ saveCorpusEntry('.planning/benchmark/corpus', entry);
878
+ "
879
+ ```
880
+
881
+ Use `classifyEntry(entry)` as a heuristic starting point, then confirm or override the category based on your reading of the paper.
882
+
883
+ </benchmark_corpus_integration>
884
+
885
+ <success_criteria>
886
+
887
+ Evaluation plan is complete when:
888
+
889
+ - [ ] Phase context loaded (RESEARCH.md, PLAN.md, deep-dives)
890
+ - [ ] Baseline and targets loaded (BASELINE.md, PRODUCT-QUALITY.md)
891
+ - [ ] Paper evaluation methodology referenced
892
+ - [ ] Metric mapping established (paper metrics -> our metrics -> product metrics)
893
+ - [ ] Verification levels determined for each metric
894
+ - [ ] Sanity checks designed (minimum 3, with exact commands)
895
+ - [ ] Proxy metrics designed with evidence (or honestly documented as absent)
896
+ - [ ] Deferred validations identified with validates_at references
897
+ - [ ] Ablation plan designed (if applicable)
898
+ - [ ] Baselines documented
899
+ - [ ] Results template included (for eval-reporter to fill)
900
+ - [ ] Evaluation confidence assessed honestly
901
+ - [ ] EVAL.md written to phase directory
902
+ - [ ] EVAL.md committed to git
903
+ - [ ] Structured return provided to orchestrator
904
+
905
+ Quality indicators:
906
+
907
+ - **Honest:** Proxy limitations acknowledged, gaps documented
908
+ - **Traceable:** Every metric traces to paper, domain knowledge, or product requirement
909
+ - **Executable:** Every check has an exact command to run
910
+ - **Complete:** All three tiers (sanity checks, proxy metrics, deferred validations) addressed (even if some are empty with rationale)
911
+ - **Risk-aware:** Deferred items have failure risk assessment
912
+
913
+ </success_criteria>