@jokerized/getresearchdone 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +103 -0
- package/README.md +211 -0
- package/agents/grd-baseline-assessor.md +684 -0
- package/agents/grd-code-reviewer.md +300 -0
- package/agents/grd-codebase-mapper.md +355 -0
- package/agents/grd-critique-agent.md +119 -0
- package/agents/grd-debugger.md +519 -0
- package/agents/grd-deep-diver.md +737 -0
- package/agents/grd-eval-planner.md +913 -0
- package/agents/grd-eval-reporter.md +717 -0
- package/agents/grd-executor.md +683 -0
- package/agents/grd-feasibility-analyst.md +624 -0
- package/agents/grd-integration-checker.md +367 -0
- package/agents/grd-knowledge-miner.md +81 -0
- package/agents/grd-migrator.md +88 -0
- package/agents/grd-phase-researcher.md +697 -0
- package/agents/grd-plan-checker.md +443 -0
- package/agents/grd-planner.md +1532 -0
- package/agents/grd-product-owner.md +562 -0
- package/agents/grd-project-researcher.md +513 -0
- package/agents/grd-research-synthesizer.md +273 -0
- package/agents/grd-roadmapper.md +798 -0
- package/agents/grd-surveyor.md +566 -0
- package/agents/grd-verifier.md +893 -0
- package/bin/gd.js +4 -0
- package/bin/gd.ts +227 -0
- package/bin/grd-manifest.js +4 -0
- package/bin/grd-manifest.ts +286 -0
- package/bin/grd-mcp-server.js +4 -0
- package/bin/grd-mcp-server.ts +124 -0
- package/bin/grd-tools.js +4 -0
- package/bin/grd-tools.ts +2471 -0
- package/bin/postinstall.js +4 -0
- package/bin/postinstall.ts +80 -0
- package/commands/add-phase.md +123 -0
- package/commands/add-todo.md +87 -0
- package/commands/assess-baseline.md +289 -0
- package/commands/autopilot.md +100 -0
- package/commands/autoplan.md +55 -0
- package/commands/check-todos.md +87 -0
- package/commands/compare-methods.md +262 -0
- package/commands/complete-milestone.md +225 -0
- package/commands/debug.md +372 -0
- package/commands/deep-dive.md +288 -0
- package/commands/discover.md +281 -0
- package/commands/discuss-phase.md +188 -0
- package/commands/discuss.md +55 -0
- package/commands/eval-report.md +310 -0
- package/commands/evolve.md +79 -0
- package/commands/execute-phase.md +1017 -0
- package/commands/feasibility.md +292 -0
- package/commands/help.md +407 -0
- package/commands/init.md +1508 -0
- package/commands/insert-phase.md +113 -0
- package/commands/iterate.md +327 -0
- package/commands/list-phase-assumptions.md +217 -0
- package/commands/long-term-roadmap.md +202 -0
- package/commands/map-codebase.md +111 -0
- package/commands/migrate.md +159 -0
- package/commands/new-milestone.md +169 -0
- package/commands/pause-work.md +83 -0
- package/commands/plan-milestone-gaps.md +373 -0
- package/commands/plan-phase.md +655 -0
- package/commands/principles.md +328 -0
- package/commands/product-plan.md +319 -0
- package/commands/progress.md +481 -0
- package/commands/quick.md +167 -0
- package/commands/reapply-patches.md +154 -0
- package/commands/remove-phase.md +97 -0
- package/commands/requirement.md +96 -0
- package/commands/resume-project.md +113 -0
- package/commands/settings.md +1144 -0
- package/commands/survey.md +242 -0
- package/commands/sync.md +246 -0
- package/commands/tracker-setup.md +322 -0
- package/commands/update.md +202 -0
- package/commands/verify-phase.md +335 -0
- package/commands/verify-work.md +701 -0
- package/commands/wireup.md +29 -0
- package/dist/bin/gd.d.ts +3 -0
- package/dist/bin/gd.d.ts.map +1 -0
- package/dist/bin/gd.js +178 -0
- package/dist/bin/gd.js.map +1 -0
- package/dist/bin/grd-manifest.d.ts +3 -0
- package/dist/bin/grd-manifest.d.ts.map +1 -0
- package/dist/bin/grd-manifest.js +202 -0
- package/dist/bin/grd-manifest.js.map +1 -0
- package/dist/bin/grd-mcp-server.d.ts +3 -0
- package/dist/bin/grd-mcp-server.d.ts.map +1 -0
- package/dist/bin/grd-mcp-server.js +71 -0
- package/dist/bin/grd-mcp-server.js.map +1 -0
- package/dist/bin/grd-tools.d.ts +3 -0
- package/dist/bin/grd-tools.d.ts.map +1 -0
- package/dist/bin/grd-tools.js +1680 -0
- package/dist/bin/grd-tools.js.map +1 -0
- package/dist/bin/postinstall.d.ts +3 -0
- package/dist/bin/postinstall.d.ts.map +1 -0
- package/dist/bin/postinstall.js +61 -0
- package/dist/bin/postinstall.js.map +1 -0
- package/dist/lib/autopilot-milestone.d.ts +2 -0
- package/dist/lib/autopilot-milestone.d.ts.map +1 -0
- package/dist/lib/autopilot-milestone.js +94 -0
- package/dist/lib/autopilot-milestone.js.map +1 -0
- package/dist/lib/autopilot-pipeline.d.ts +2 -0
- package/dist/lib/autopilot-pipeline.d.ts.map +1 -0
- package/dist/lib/autopilot-pipeline.js +830 -0
- package/dist/lib/autopilot-pipeline.js.map +1 -0
- package/dist/lib/autopilot-waves.d.ts +2 -0
- package/dist/lib/autopilot-waves.d.ts.map +1 -0
- package/dist/lib/autopilot-waves.js +266 -0
- package/dist/lib/autopilot-waves.js.map +1 -0
- package/dist/lib/autopilot.d.ts +2 -0
- package/dist/lib/autopilot.d.ts.map +1 -0
- package/dist/lib/autopilot.js +1314 -0
- package/dist/lib/autopilot.js.map +1 -0
- package/dist/lib/autoplan.d.ts +2 -0
- package/dist/lib/autoplan.d.ts.map +1 -0
- package/dist/lib/autoplan.js +198 -0
- package/dist/lib/autoplan.js.map +1 -0
- package/dist/lib/autoresearch.d.ts +2 -0
- package/dist/lib/autoresearch.d.ts.map +1 -0
- package/dist/lib/autoresearch.js +626 -0
- package/dist/lib/autoresearch.js.map +1 -0
- package/dist/lib/backend.d.ts +2 -0
- package/dist/lib/backend.d.ts.map +1 -0
- package/dist/lib/backend.js +1036 -0
- package/dist/lib/backend.js.map +1 -0
- package/dist/lib/benchmark.d.ts +99 -0
- package/dist/lib/benchmark.d.ts.map +1 -0
- package/dist/lib/benchmark.js +278 -0
- package/dist/lib/benchmark.js.map +1 -0
- package/dist/lib/citations.d.ts +2 -0
- package/dist/lib/citations.d.ts.map +1 -0
- package/dist/lib/citations.js +642 -0
- package/dist/lib/citations.js.map +1 -0
- package/dist/lib/cleanup.d.ts +2 -0
- package/dist/lib/cleanup.d.ts.map +1 -0
- package/dist/lib/cleanup.js +1222 -0
- package/dist/lib/cleanup.js.map +1 -0
- package/dist/lib/cli/adapters.d.ts +10 -0
- package/dist/lib/cli/adapters.d.ts.map +1 -0
- package/dist/lib/cli/adapters.js +27 -0
- package/dist/lib/cli/adapters.js.map +1 -0
- package/dist/lib/cli/agent.d.ts +17 -0
- package/dist/lib/cli/agent.d.ts.map +1 -0
- package/dist/lib/cli/agent.js +53 -0
- package/dist/lib/cli/agent.js.map +1 -0
- package/dist/lib/cli/index.d.ts +21 -0
- package/dist/lib/cli/index.d.ts.map +1 -0
- package/dist/lib/cli/index.js +264 -0
- package/dist/lib/cli/index.js.map +1 -0
- package/dist/lib/cli/output.d.ts +20 -0
- package/dist/lib/cli/output.d.ts.map +1 -0
- package/dist/lib/cli/output.js +22 -0
- package/dist/lib/cli/output.js.map +1 -0
- package/dist/lib/cli/scan-dispatch.d.ts +9 -0
- package/dist/lib/cli/scan-dispatch.d.ts.map +1 -0
- package/dist/lib/cli/scan-dispatch.js +107 -0
- package/dist/lib/cli/scan-dispatch.js.map +1 -0
- package/dist/lib/cli/tools.d.ts +16 -0
- package/dist/lib/cli/tools.d.ts.map +1 -0
- package/dist/lib/cli/tools.js +168 -0
- package/dist/lib/cli/tools.js.map +1 -0
- package/dist/lib/commands/_dashboard-parsers.d.ts +2 -0
- package/dist/lib/commands/_dashboard-parsers.d.ts.map +1 -0
- package/dist/lib/commands/_dashboard-parsers.js +192 -0
- package/dist/lib/commands/_dashboard-parsers.js.map +1 -0
- package/dist/lib/commands/analysis.d.ts +2 -0
- package/dist/lib/commands/analysis.d.ts.map +1 -0
- package/dist/lib/commands/analysis.js +1418 -0
- package/dist/lib/commands/analysis.js.map +1 -0
- package/dist/lib/commands/assumptions.d.ts +2 -0
- package/dist/lib/commands/assumptions.d.ts.map +1 -0
- package/dist/lib/commands/assumptions.js +166 -0
- package/dist/lib/commands/assumptions.js.map +1 -0
- package/dist/lib/commands/blame.d.ts +2 -0
- package/dist/lib/commands/blame.d.ts.map +1 -0
- package/dist/lib/commands/blame.js +133 -0
- package/dist/lib/commands/blame.js.map +1 -0
- package/dist/lib/commands/budget.d.ts +2 -0
- package/dist/lib/commands/budget.d.ts.map +1 -0
- package/dist/lib/commands/budget.js +100 -0
- package/dist/lib/commands/budget.js.map +1 -0
- package/dist/lib/commands/check-plans.d.ts +2 -0
- package/dist/lib/commands/check-plans.d.ts.map +1 -0
- package/dist/lib/commands/check-plans.js +190 -0
- package/dist/lib/commands/check-plans.js.map +1 -0
- package/dist/lib/commands/config.d.ts +2 -0
- package/dist/lib/commands/config.d.ts.map +1 -0
- package/dist/lib/commands/config.js +188 -0
- package/dist/lib/commands/config.js.map +1 -0
- package/dist/lib/commands/dashboard.d.ts +2 -0
- package/dist/lib/commands/dashboard.d.ts.map +1 -0
- package/dist/lib/commands/dashboard.js +466 -0
- package/dist/lib/commands/dashboard.js.map +1 -0
- package/dist/lib/commands/estimate.d.ts +2 -0
- package/dist/lib/commands/estimate.d.ts.map +1 -0
- package/dist/lib/commands/estimate.js +148 -0
- package/dist/lib/commands/estimate.js.map +1 -0
- package/dist/lib/commands/eval-diff.d.ts +2 -0
- package/dist/lib/commands/eval-diff.d.ts.map +1 -0
- package/dist/lib/commands/eval-diff.js +213 -0
- package/dist/lib/commands/eval-diff.js.map +1 -0
- package/dist/lib/commands/freshness.d.ts +2 -0
- package/dist/lib/commands/freshness.d.ts.map +1 -0
- package/dist/lib/commands/freshness.js +163 -0
- package/dist/lib/commands/freshness.js.map +1 -0
- package/dist/lib/commands/health.d.ts +2 -0
- package/dist/lib/commands/health.d.ts.map +1 -0
- package/dist/lib/commands/health.js +435 -0
- package/dist/lib/commands/health.js.map +1 -0
- package/dist/lib/commands/index.d.ts +2 -0
- package/dist/lib/commands/index.d.ts.map +1 -0
- package/dist/lib/commands/index.js +128 -0
- package/dist/lib/commands/index.js.map +1 -0
- package/dist/lib/commands/install.d.ts +56 -0
- package/dist/lib/commands/install.d.ts.map +1 -0
- package/dist/lib/commands/install.js +214 -0
- package/dist/lib/commands/install.js.map +1 -0
- package/dist/lib/commands/knowhow-aggregator.d.ts +2 -0
- package/dist/lib/commands/knowhow-aggregator.d.ts.map +1 -0
- package/dist/lib/commands/knowhow-aggregator.js +279 -0
- package/dist/lib/commands/knowhow-aggregator.js.map +1 -0
- package/dist/lib/commands/knowledge-search.d.ts +2 -0
- package/dist/lib/commands/knowledge-search.d.ts.map +1 -0
- package/dist/lib/commands/knowledge-search.js +113 -0
- package/dist/lib/commands/knowledge-search.js.map +1 -0
- package/dist/lib/commands/long-term-roadmap.d.ts +2 -0
- package/dist/lib/commands/long-term-roadmap.d.ts.map +1 -0
- package/dist/lib/commands/long-term-roadmap.js +272 -0
- package/dist/lib/commands/long-term-roadmap.js.map +1 -0
- package/dist/lib/commands/patterns.d.ts +91 -0
- package/dist/lib/commands/patterns.d.ts.map +1 -0
- package/dist/lib/commands/patterns.js +391 -0
- package/dist/lib/commands/patterns.js.map +1 -0
- package/dist/lib/commands/phase-info.d.ts +2 -0
- package/dist/lib/commands/phase-info.d.ts.map +1 -0
- package/dist/lib/commands/phase-info.js +509 -0
- package/dist/lib/commands/phase-info.js.map +1 -0
- package/dist/lib/commands/plan-lint.d.ts +56 -0
- package/dist/lib/commands/plan-lint.d.ts.map +1 -0
- package/dist/lib/commands/plan-lint.js +481 -0
- package/dist/lib/commands/plan-lint.js.map +1 -0
- package/dist/lib/commands/plan-phase.d.ts +53 -0
- package/dist/lib/commands/plan-phase.d.ts.map +1 -0
- package/dist/lib/commands/plan-phase.js +288 -0
- package/dist/lib/commands/plan-phase.js.map +1 -0
- package/dist/lib/commands/progress.d.ts +2 -0
- package/dist/lib/commands/progress.d.ts.map +1 -0
- package/dist/lib/commands/progress.js +266 -0
- package/dist/lib/commands/progress.js.map +1 -0
- package/dist/lib/commands/quality.d.ts +2 -0
- package/dist/lib/commands/quality.d.ts.map +1 -0
- package/dist/lib/commands/quality.js +80 -0
- package/dist/lib/commands/quality.js.map +1 -0
- package/dist/lib/commands/rollback.d.ts +2 -0
- package/dist/lib/commands/rollback.d.ts.map +1 -0
- package/dist/lib/commands/rollback.js +145 -0
- package/dist/lib/commands/rollback.js.map +1 -0
- package/dist/lib/commands/scan.d.ts +25 -0
- package/dist/lib/commands/scan.d.ts.map +1 -0
- package/dist/lib/commands/scan.js +28 -0
- package/dist/lib/commands/scan.js.map +1 -0
- package/dist/lib/commands/search.d.ts +2 -0
- package/dist/lib/commands/search.d.ts.map +1 -0
- package/dist/lib/commands/search.js +212 -0
- package/dist/lib/commands/search.js.map +1 -0
- package/dist/lib/commands/select-candidate.d.ts +128 -0
- package/dist/lib/commands/select-candidate.d.ts.map +1 -0
- package/dist/lib/commands/select-candidate.js +518 -0
- package/dist/lib/commands/select-candidate.js.map +1 -0
- package/dist/lib/commands/singularity.d.ts +2 -0
- package/dist/lib/commands/singularity.d.ts.map +1 -0
- package/dist/lib/commands/singularity.js +185 -0
- package/dist/lib/commands/singularity.js.map +1 -0
- package/dist/lib/commands/slug-timestamp.d.ts +2 -0
- package/dist/lib/commands/slug-timestamp.d.ts.map +1 -0
- package/dist/lib/commands/slug-timestamp.js +54 -0
- package/dist/lib/commands/slug-timestamp.js.map +1 -0
- package/dist/lib/commands/tail.d.ts +2 -0
- package/dist/lib/commands/tail.d.ts.map +1 -0
- package/dist/lib/commands/tail.js +100 -0
- package/dist/lib/commands/tail.js.map +1 -0
- package/dist/lib/commands/todo.d.ts +2 -0
- package/dist/lib/commands/todo.d.ts.map +1 -0
- package/dist/lib/commands/todo.js +200 -0
- package/dist/lib/commands/todo.js.map +1 -0
- package/dist/lib/commands/watch.d.ts +2 -0
- package/dist/lib/commands/watch.d.ts.map +1 -0
- package/dist/lib/commands/watch.js +72 -0
- package/dist/lib/commands/watch.js.map +1 -0
- package/dist/lib/complexity.d.ts +55 -0
- package/dist/lib/complexity.d.ts.map +1 -0
- package/dist/lib/complexity.js +80 -0
- package/dist/lib/complexity.js.map +1 -0
- package/dist/lib/context/agents.d.ts +2 -0
- package/dist/lib/context/agents.d.ts.map +1 -0
- package/dist/lib/context/agents.js +344 -0
- package/dist/lib/context/agents.js.map +1 -0
- package/dist/lib/context/base.d.ts +2 -0
- package/dist/lib/context/base.d.ts.map +1 -0
- package/dist/lib/context/base.js +81 -0
- package/dist/lib/context/base.js.map +1 -0
- package/dist/lib/context/execute.d.ts +2 -0
- package/dist/lib/context/execute.d.ts.map +1 -0
- package/dist/lib/context/execute.js +753 -0
- package/dist/lib/context/execute.js.map +1 -0
- package/dist/lib/context/index.d.ts +2 -0
- package/dist/lib/context/index.d.ts.map +1 -0
- package/dist/lib/context/index.js +88 -0
- package/dist/lib/context/index.js.map +1 -0
- package/dist/lib/context/progress.d.ts +2 -0
- package/dist/lib/context/progress.d.ts.map +1 -0
- package/dist/lib/context/progress.js +178 -0
- package/dist/lib/context/progress.js.map +1 -0
- package/dist/lib/context/project.d.ts +2 -0
- package/dist/lib/context/project.d.ts.map +1 -0
- package/dist/lib/context/project.js +413 -0
- package/dist/lib/context/project.js.map +1 -0
- package/dist/lib/context/research.d.ts +2 -0
- package/dist/lib/context/research.d.ts.map +1 -0
- package/dist/lib/context/research.js +466 -0
- package/dist/lib/context/research.js.map +1 -0
- package/dist/lib/dead-ends.d.ts +28 -0
- package/dist/lib/dead-ends.d.ts.map +1 -0
- package/dist/lib/dead-ends.js +451 -0
- package/dist/lib/dead-ends.js.map +1 -0
- package/dist/lib/deps.d.ts +2 -0
- package/dist/lib/deps.d.ts.map +1 -0
- package/dist/lib/deps.js +630 -0
- package/dist/lib/deps.js.map +1 -0
- package/dist/lib/discussion.d.ts +2 -0
- package/dist/lib/discussion.d.ts.map +1 -0
- package/dist/lib/discussion.js +1041 -0
- package/dist/lib/discussion.js.map +1 -0
- package/dist/lib/drift.d.ts +36 -0
- package/dist/lib/drift.d.ts.map +1 -0
- package/dist/lib/drift.js +481 -0
- package/dist/lib/drift.js.map +1 -0
- package/dist/lib/evolve/_dimensions-features.d.ts +2 -0
- package/dist/lib/evolve/_dimensions-features.d.ts.map +1 -0
- package/dist/lib/evolve/_dimensions-features.js +369 -0
- package/dist/lib/evolve/_dimensions-features.js.map +1 -0
- package/dist/lib/evolve/_dimensions.d.ts +2 -0
- package/dist/lib/evolve/_dimensions.d.ts.map +1 -0
- package/dist/lib/evolve/_dimensions.js +358 -0
- package/dist/lib/evolve/_dimensions.js.map +1 -0
- package/dist/lib/evolve/_product-ideation.d.ts +2 -0
- package/dist/lib/evolve/_product-ideation.d.ts.map +1 -0
- package/dist/lib/evolve/_product-ideation.js +281 -0
- package/dist/lib/evolve/_product-ideation.js.map +1 -0
- package/dist/lib/evolve/_prompts.d.ts +2 -0
- package/dist/lib/evolve/_prompts.d.ts.map +1 -0
- package/dist/lib/evolve/_prompts.js +153 -0
- package/dist/lib/evolve/_prompts.js.map +1 -0
- package/dist/lib/evolve/cli.d.ts +2 -0
- package/dist/lib/evolve/cli.d.ts.map +1 -0
- package/dist/lib/evolve/cli.js +224 -0
- package/dist/lib/evolve/cli.js.map +1 -0
- package/dist/lib/evolve/discovery.d.ts +2 -0
- package/dist/lib/evolve/discovery.d.ts.map +1 -0
- package/dist/lib/evolve/discovery.js +391 -0
- package/dist/lib/evolve/discovery.js.map +1 -0
- package/dist/lib/evolve/index.d.ts +2 -0
- package/dist/lib/evolve/index.d.ts.map +1 -0
- package/dist/lib/evolve/index.js +88 -0
- package/dist/lib/evolve/index.js.map +1 -0
- package/dist/lib/evolve/orchestrator.d.ts +2 -0
- package/dist/lib/evolve/orchestrator.d.ts.map +1 -0
- package/dist/lib/evolve/orchestrator.js +851 -0
- package/dist/lib/evolve/orchestrator.js.map +1 -0
- package/dist/lib/evolve/scoring.d.ts +2 -0
- package/dist/lib/evolve/scoring.d.ts.map +1 -0
- package/dist/lib/evolve/scoring.js +118 -0
- package/dist/lib/evolve/scoring.js.map +1 -0
- package/dist/lib/evolve/state.d.ts +2 -0
- package/dist/lib/evolve/state.d.ts.map +1 -0
- package/dist/lib/evolve/state.js +264 -0
- package/dist/lib/evolve/state.js.map +1 -0
- package/dist/lib/evolve/types.d.ts +249 -0
- package/dist/lib/evolve/types.d.ts.map +1 -0
- package/dist/lib/evolve/types.js +3 -0
- package/dist/lib/evolve/types.js.map +1 -0
- package/dist/lib/frontmatter.d.ts +2 -0
- package/dist/lib/frontmatter.d.ts.map +1 -0
- package/dist/lib/frontmatter.js +513 -0
- package/dist/lib/frontmatter.js.map +1 -0
- package/dist/lib/gates.d.ts +2 -0
- package/dist/lib/gates.d.ts.map +1 -0
- package/dist/lib/gates.js +578 -0
- package/dist/lib/gates.js.map +1 -0
- package/dist/lib/genome.d.ts +10 -0
- package/dist/lib/genome.d.ts.map +1 -0
- package/dist/lib/genome.js +368 -0
- package/dist/lib/genome.js.map +1 -0
- package/dist/lib/got.d.ts +2 -0
- package/dist/lib/got.d.ts.map +1 -0
- package/dist/lib/got.js +280 -0
- package/dist/lib/got.js.map +1 -0
- package/dist/lib/invariants.d.ts +2 -0
- package/dist/lib/invariants.d.ts.map +1 -0
- package/dist/lib/invariants.js +298 -0
- package/dist/lib/invariants.js.map +1 -0
- package/dist/lib/knowledge.d.ts +2 -0
- package/dist/lib/knowledge.d.ts.map +1 -0
- package/dist/lib/knowledge.js +658 -0
- package/dist/lib/knowledge.js.map +1 -0
- package/dist/lib/long-term-roadmap.d.ts +2 -0
- package/dist/lib/long-term-roadmap.d.ts.map +1 -0
- package/dist/lib/long-term-roadmap.js +602 -0
- package/dist/lib/long-term-roadmap.js.map +1 -0
- package/dist/lib/markdown-split.d.ts +2 -0
- package/dist/lib/markdown-split.d.ts.map +1 -0
- package/dist/lib/markdown-split.js +199 -0
- package/dist/lib/markdown-split.js.map +1 -0
- package/dist/lib/mcp-server.d.ts +2 -0
- package/dist/lib/mcp-server.d.ts.map +1 -0
- package/dist/lib/mcp-server.js +2424 -0
- package/dist/lib/mcp-server.js.map +1 -0
- package/dist/lib/metrics.d.ts +16 -0
- package/dist/lib/metrics.d.ts.map +1 -0
- package/dist/lib/metrics.js +48 -0
- package/dist/lib/metrics.js.map +1 -0
- package/dist/lib/overstory.d.ts +2 -0
- package/dist/lib/overstory.d.ts.map +1 -0
- package/dist/lib/overstory.js +211 -0
- package/dist/lib/overstory.js.map +1 -0
- package/dist/lib/parallel.d.ts +2 -0
- package/dist/lib/parallel.d.ts.map +1 -0
- package/dist/lib/parallel.js +349 -0
- package/dist/lib/parallel.js.map +1 -0
- package/dist/lib/paths.d.ts +2 -0
- package/dist/lib/paths.d.ts.map +1 -0
- package/dist/lib/paths.js +254 -0
- package/dist/lib/paths.js.map +1 -0
- package/dist/lib/phase-complete-llm.d.ts +22 -0
- package/dist/lib/phase-complete-llm.d.ts.map +1 -0
- package/dist/lib/phase-complete-llm.js +331 -0
- package/dist/lib/phase-complete-llm.js.map +1 -0
- package/dist/lib/phase-complete.d.ts +46 -0
- package/dist/lib/phase-complete.d.ts.map +1 -0
- package/dist/lib/phase-complete.js +278 -0
- package/dist/lib/phase-complete.js.map +1 -0
- package/dist/lib/phase-io.d.ts +2 -0
- package/dist/lib/phase-io.d.ts.map +1 -0
- package/dist/lib/phase-io.js +126 -0
- package/dist/lib/phase-io.js.map +1 -0
- package/dist/lib/phase.d.ts +2 -0
- package/dist/lib/phase.d.ts.map +1 -0
- package/dist/lib/phase.js +1344 -0
- package/dist/lib/phase.js.map +1 -0
- package/dist/lib/plan-tournament.d.ts +63 -0
- package/dist/lib/plan-tournament.d.ts.map +1 -0
- package/dist/lib/plan-tournament.js +353 -0
- package/dist/lib/plan-tournament.js.map +1 -0
- package/dist/lib/refinement.d.ts +74 -0
- package/dist/lib/refinement.d.ts.map +1 -0
- package/dist/lib/refinement.js +283 -0
- package/dist/lib/refinement.js.map +1 -0
- package/dist/lib/requirements.d.ts +2 -0
- package/dist/lib/requirements.d.ts.map +1 -0
- package/dist/lib/requirements.js +355 -0
- package/dist/lib/requirements.js.map +1 -0
- package/dist/lib/research-bundle.d.ts +2 -0
- package/dist/lib/research-bundle.d.ts.map +1 -0
- package/dist/lib/research-bundle.js +246 -0
- package/dist/lib/research-bundle.js.map +1 -0
- package/dist/lib/roadmap.d.ts +2 -0
- package/dist/lib/roadmap.d.ts.map +1 -0
- package/dist/lib/roadmap.js +541 -0
- package/dist/lib/roadmap.js.map +1 -0
- package/dist/lib/sample.d.ts +16 -0
- package/dist/lib/sample.d.ts.map +1 -0
- package/dist/lib/sample.js +20 -0
- package/dist/lib/sample.js.map +1 -0
- package/dist/lib/scaffold.d.ts +2 -0
- package/dist/lib/scaffold.d.ts.map +1 -0
- package/dist/lib/scaffold.js +355 -0
- package/dist/lib/scaffold.js.map +1 -0
- package/dist/lib/scan/_utils.d.ts +11 -0
- package/dist/lib/scan/_utils.d.ts.map +1 -0
- package/dist/lib/scan/_utils.js +36 -0
- package/dist/lib/scan/_utils.js.map +1 -0
- package/dist/lib/scan/base64.d.ts +15 -0
- package/dist/lib/scan/base64.d.ts.map +1 -0
- package/dist/lib/scan/base64.js +66 -0
- package/dist/lib/scan/base64.js.map +1 -0
- package/dist/lib/scan/ignorefile.d.ts +30 -0
- package/dist/lib/scan/ignorefile.d.ts.map +1 -0
- package/dist/lib/scan/ignorefile.js +101 -0
- package/dist/lib/scan/ignorefile.js.map +1 -0
- package/dist/lib/scan/injection.d.ts +14 -0
- package/dist/lib/scan/injection.d.ts.map +1 -0
- package/dist/lib/scan/injection.js +39 -0
- package/dist/lib/scan/injection.js.map +1 -0
- package/dist/lib/scan/patterns.d.ts +17 -0
- package/dist/lib/scan/patterns.d.ts.map +1 -0
- package/dist/lib/scan/patterns.js +123 -0
- package/dist/lib/scan/patterns.js.map +1 -0
- package/dist/lib/scan/strip-markdown.d.ts +7 -0
- package/dist/lib/scan/strip-markdown.d.ts.map +1 -0
- package/dist/lib/scan/strip-markdown.js +38 -0
- package/dist/lib/scan/strip-markdown.js.map +1 -0
- package/dist/lib/scan/types.d.ts +23 -0
- package/dist/lib/scan/types.d.ts.map +1 -0
- package/dist/lib/scan/types.js +3 -0
- package/dist/lib/scan/types.js.map +1 -0
- package/dist/lib/scheduler-wait.d.ts +2 -0
- package/dist/lib/scheduler-wait.d.ts.map +1 -0
- package/dist/lib/scheduler-wait.js +59 -0
- package/dist/lib/scheduler-wait.js.map +1 -0
- package/dist/lib/scheduler.d.ts +254 -0
- package/dist/lib/scheduler.d.ts.map +1 -0
- package/dist/lib/scheduler.js +1147 -0
- package/dist/lib/scheduler.js.map +1 -0
- package/dist/lib/state.d.ts +2 -0
- package/dist/lib/state.d.ts.map +1 -0
- package/dist/lib/state.js +744 -0
- package/dist/lib/state.js.map +1 -0
- package/dist/lib/think.d.ts +18 -0
- package/dist/lib/think.d.ts.map +1 -0
- package/dist/lib/think.js +317 -0
- package/dist/lib/think.js.map +1 -0
- package/dist/lib/tracker.d.ts +2 -0
- package/dist/lib/tracker.d.ts.map +1 -0
- package/dist/lib/tracker.js +1121 -0
- package/dist/lib/tracker.js.map +1 -0
- package/dist/lib/types.d.ts +1514 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +4 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.d.ts +2 -0
- package/dist/lib/utils.d.ts.map +1 -0
- package/dist/lib/utils.js +1363 -0
- package/dist/lib/utils.js.map +1 -0
- package/dist/lib/verify.d.ts +2 -0
- package/dist/lib/verify.d.ts.map +1 -0
- package/dist/lib/verify.js +1153 -0
- package/dist/lib/verify.js.map +1 -0
- package/dist/lib/wireup/autofix.d.ts +2 -0
- package/dist/lib/wireup/autofix.d.ts.map +1 -0
- package/dist/lib/wireup/autofix.js +188 -0
- package/dist/lib/wireup/autofix.js.map +1 -0
- package/dist/lib/wireup/cli.d.ts +2 -0
- package/dist/lib/wireup/cli.d.ts.map +1 -0
- package/dist/lib/wireup/cli.js +194 -0
- package/dist/lib/wireup/cli.js.map +1 -0
- package/dist/lib/wireup/detection.d.ts +47 -0
- package/dist/lib/wireup/detection.d.ts.map +1 -0
- package/dist/lib/wireup/detection.js +410 -0
- package/dist/lib/wireup/detection.js.map +1 -0
- package/dist/lib/wireup/discovery.d.ts +2 -0
- package/dist/lib/wireup/discovery.d.ts.map +1 -0
- package/dist/lib/wireup/discovery.js +934 -0
- package/dist/lib/wireup/discovery.js.map +1 -0
- package/dist/lib/wireup/execution.d.ts +2 -0
- package/dist/lib/wireup/execution.d.ts.map +1 -0
- package/dist/lib/wireup/execution.js +573 -0
- package/dist/lib/wireup/execution.js.map +1 -0
- package/dist/lib/wireup/index.d.ts +2 -0
- package/dist/lib/wireup/index.d.ts.map +1 -0
- package/dist/lib/wireup/index.js +85 -0
- package/dist/lib/wireup/index.js.map +1 -0
- package/dist/lib/wireup/orchestrator.d.ts +2 -0
- package/dist/lib/wireup/orchestrator.d.ts.map +1 -0
- package/dist/lib/wireup/orchestrator.js +366 -0
- package/dist/lib/wireup/orchestrator.js.map +1 -0
- package/dist/lib/wireup/report.d.ts +47 -0
- package/dist/lib/wireup/report.d.ts.map +1 -0
- package/dist/lib/wireup/report.js +201 -0
- package/dist/lib/wireup/report.js.map +1 -0
- package/dist/lib/wireup/scenarios.d.ts +2 -0
- package/dist/lib/wireup/scenarios.d.ts.map +1 -0
- package/dist/lib/wireup/scenarios.js +516 -0
- package/dist/lib/wireup/scenarios.js.map +1 -0
- package/dist/lib/wireup/state.d.ts +2 -0
- package/dist/lib/wireup/state.d.ts.map +1 -0
- package/dist/lib/wireup/state.js +102 -0
- package/dist/lib/wireup/state.js.map +1 -0
- package/dist/lib/wireup/types.d.ts +376 -0
- package/dist/lib/wireup/types.d.ts.map +1 -0
- package/dist/lib/wireup/types.js +3 -0
- package/dist/lib/wireup/types.js.map +1 -0
- package/dist/lib/worktree.d.ts +2 -0
- package/dist/lib/worktree.d.ts.map +1 -0
- package/dist/lib/worktree.js +999 -0
- package/dist/lib/worktree.js.map +1 -0
- package/lib/autopilot-milestone.ts +136 -0
- package/lib/autopilot-pipeline.ts +1179 -0
- package/lib/autopilot-waves.ts +361 -0
- package/lib/autopilot.ts +1874 -0
- package/lib/autoplan.ts +280 -0
- package/lib/autoresearch.js +4 -0
- package/lib/autoresearch.ts +886 -0
- package/lib/backend.ts +1252 -0
- package/lib/benchmark.ts +341 -0
- package/lib/citations.ts +760 -0
- package/lib/cleanup.ts +1588 -0
- package/lib/cli/adapters.ts +41 -0
- package/lib/cli/agent.ts +83 -0
- package/lib/cli/index.ts +273 -0
- package/lib/cli/output.ts +33 -0
- package/lib/cli/scan-dispatch.ts +130 -0
- package/lib/cli/tools.ts +198 -0
- package/lib/commands/_dashboard-parsers.ts +275 -0
- package/lib/commands/analysis.ts +1851 -0
- package/lib/commands/assumptions.ts +232 -0
- package/lib/commands/blame.ts +174 -0
- package/lib/commands/budget.ts +148 -0
- package/lib/commands/check-plans.ts +233 -0
- package/lib/commands/config.ts +287 -0
- package/lib/commands/dashboard.ts +680 -0
- package/lib/commands/estimate.ts +204 -0
- package/lib/commands/eval-diff.ts +252 -0
- package/lib/commands/freshness.ts +213 -0
- package/lib/commands/health.ts +607 -0
- package/lib/commands/index.ts +266 -0
- package/lib/commands/install.ts +307 -0
- package/lib/commands/knowhow-aggregator.ts +345 -0
- package/lib/commands/knowledge-search.ts +153 -0
- package/lib/commands/long-term-roadmap.ts +390 -0
- package/lib/commands/patterns.ts +465 -0
- package/lib/commands/phase-info.ts +698 -0
- package/lib/commands/plan-lint.ts +546 -0
- package/lib/commands/plan-phase.ts +375 -0
- package/lib/commands/progress.ts +319 -0
- package/lib/commands/quality.ts +138 -0
- package/lib/commands/rollback.ts +195 -0
- package/lib/commands/scan.ts +72 -0
- package/lib/commands/search.ts +300 -0
- package/lib/commands/select-candidate.ts +687 -0
- package/lib/commands/singularity.ts +222 -0
- package/lib/commands/slug-timestamp.ts +74 -0
- package/lib/commands/tail.ts +129 -0
- package/lib/commands/todo.ts +273 -0
- package/lib/commands/watch.ts +80 -0
- package/lib/complexity.ts +117 -0
- package/lib/context/agents.ts +505 -0
- package/lib/context/base.ts +123 -0
- package/lib/context/execute.ts +977 -0
- package/lib/context/index.ts +110 -0
- package/lib/context/progress.ts +278 -0
- package/lib/context/project.ts +531 -0
- package/lib/context/research.ts +646 -0
- package/lib/dead-ends.ts +506 -0
- package/lib/deps.ts +773 -0
- package/lib/discussion.ts +1275 -0
- package/lib/drift.ts +519 -0
- package/lib/evolve/_dimensions-features.ts +525 -0
- package/lib/evolve/_dimensions.ts +511 -0
- package/lib/evolve/_product-ideation.ts +405 -0
- package/lib/evolve/_prompts.ts +178 -0
- package/lib/evolve/cli.ts +330 -0
- package/lib/evolve/discovery.ts +571 -0
- package/lib/evolve/index.ts +105 -0
- package/lib/evolve/orchestrator.ts +1139 -0
- package/lib/evolve/scoring.ts +167 -0
- package/lib/evolve/state.ts +330 -0
- package/lib/evolve/types.ts +290 -0
- package/lib/frontmatter.ts +615 -0
- package/lib/gates.ts +695 -0
- package/lib/genome.ts +402 -0
- package/lib/got.js +4 -0
- package/lib/got.ts +361 -0
- package/lib/invariants.ts +378 -0
- package/lib/knowledge.ts +768 -0
- package/lib/long-term-roadmap.ts +806 -0
- package/lib/markdown-split.ts +273 -0
- package/lib/mcp-server.ts +3292 -0
- package/lib/metrics.ts +49 -0
- package/lib/overstory.ts +270 -0
- package/lib/parallel.ts +570 -0
- package/lib/paths.ts +293 -0
- package/lib/phase-complete-llm.ts +376 -0
- package/lib/phase-complete.ts +366 -0
- package/lib/phase-io.ts +101 -0
- package/lib/phase.ts +1981 -0
- package/lib/plan-tournament.ts +426 -0
- package/lib/refinement.ts +349 -0
- package/lib/requirements.ts +469 -0
- package/lib/research-bundle.ts +300 -0
- package/lib/roadmap.ts +775 -0
- package/lib/scaffold.ts +480 -0
- package/lib/scan/_utils.ts +37 -0
- package/lib/scan/base64.ts +90 -0
- package/lib/scan/ignorefile.ts +109 -0
- package/lib/scan/injection.ts +67 -0
- package/lib/scan/patterns.ts +139 -0
- package/lib/scan/strip-markdown.ts +39 -0
- package/lib/scan/types.ts +28 -0
- package/lib/scheduler-wait.ts +58 -0
- package/lib/scheduler.ts +1370 -0
- package/lib/state.ts +1000 -0
- package/lib/think.ts +365 -0
- package/lib/tracker.ts +1591 -0
- package/lib/types.ts +1663 -0
- package/lib/utils.ts +1479 -0
- package/lib/verify.ts +1434 -0
- package/lib/wireup/autofix.ts +241 -0
- package/lib/wireup/cli.ts +278 -0
- package/lib/wireup/detection.ts +542 -0
- package/lib/wireup/discovery.ts +1063 -0
- package/lib/wireup/execution.ts +686 -0
- package/lib/wireup/index.ts +117 -0
- package/lib/wireup/orchestrator.ts +519 -0
- package/lib/wireup/report.ts +286 -0
- package/lib/wireup/scenarios.ts +616 -0
- package/lib/wireup/state.ts +139 -0
- package/lib/wireup/types.ts +436 -0
- package/lib/worktree.ts +1309 -0
- package/package.json +67 -0
|
@@ -0,0 +1,893 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: grd-verifier
|
|
3
|
+
description: Verifies phase goal achievement through tiered verification (sanity/proxy/deferred). Checks codebase delivers what phase promised with quantitative experiment results. Creates VERIFICATION.md report.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
color: green
|
|
6
|
+
effort: low
|
|
7
|
+
maxTurns: 10
|
|
8
|
+
disallowedTools:
|
|
9
|
+
- Edit
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
<role>
|
|
13
|
+
You are a GRD phase verifier. You verify that a phase achieved its GOAL using a tiered verification system, not just completed its TASKS.
|
|
14
|
+
|
|
15
|
+
Your job: Tiered goal-backward verification. Start from what the phase SHOULD deliver, apply the appropriate verification level, and produce quantitative results.
|
|
16
|
+
|
|
17
|
+
**Critical mindset:** Do NOT trust SUMMARY.md claims. SUMMARYs document what Claude SAID it did. You verify what ACTUALLY exists in the code and what metrics ACTUALLY show. These often differ.
|
|
18
|
+
|
|
19
|
+
**R&D verification is different from product verification:**
|
|
20
|
+
- Not everything can be fully verified immediately
|
|
21
|
+
- Some verifications require full pipeline integration
|
|
22
|
+
- Proxy metrics are acceptable intermediate checks
|
|
23
|
+
- Deferred validations must be tracked, not forgotten
|
|
24
|
+
</role>
|
|
25
|
+
|
|
26
|
+
<naming_convention>
|
|
27
|
+
ALL generated markdown files MUST use UPPERCASE filenames. This applies to every .md file written into .planning/ or any subdirectory:
|
|
28
|
+
- Standard files: STATE.md, ROADMAP.md, REQUIREMENTS.md, PLAN.md, SUMMARY.md, VERIFICATION.md, EVAL.md, REVIEW.md, CONTEXT.md, RESEARCH.md, BASELINE.md
|
|
29
|
+
- Slug-based files: use UPPERCASE slugs — e.g., VASWANI-ATTENTION-2017.md, not vaswani-attention-2017.md
|
|
30
|
+
- Feasibility files: {METHOD-SLUG}-FEASIBILITY.md
|
|
31
|
+
- Todo files: {DATE}-{SLUG}.md (date lowercase ok, slug UPPERCASE)
|
|
32
|
+
- Handoff files: .CONTINUE-HERE.md
|
|
33
|
+
- Quick task summaries: {N}-SUMMARY.md
|
|
34
|
+
Never create lowercase .md filenames in .planning/.
|
|
35
|
+
</naming_convention>
|
|
36
|
+
|
|
37
|
+
<evidence_standard>
|
|
38
|
+
|
|
39
|
+
## Evidence Standard (required for every claim)
|
|
40
|
+
|
|
41
|
+
Every value in an "Evidence" cell, every quantitative result, every gap
|
|
42
|
+
entry, and the Reflection section's `evidence` row MUST trace to one of
|
|
43
|
+
four concrete kinds. Vague summaries are not evidence.
|
|
44
|
+
|
|
45
|
+
| Kind | Format | Example |
|
|
46
|
+
|------|--------|---------|
|
|
47
|
+
| **file:line** | `path/to/file.ext:LINE` (single file, single line; range OK as `:LINE-LINE`) | `src/models/encoder.py:142` |
|
|
48
|
+
| **command output** | a verbatim copy-pasted line from a command you ran in this session | `Output shape: torch.Size([1, 10, 512])` |
|
|
49
|
+
| **metric value** | a number with units and a comparison to a target or baseline | `accuracy=86.3% (target >85%, baseline 82%)` |
|
|
50
|
+
| **deferred** | `Level 3 — tracked in STATE.md: <reason>` (only when verification_level=deferred) | `Level 3 — tracked in STATE.md: needs full test set` |
|
|
51
|
+
|
|
52
|
+
**Banned phrasings.** If you are about to write any of these in an
|
|
53
|
+
Evidence cell, the evidence is not strong enough — either run a real
|
|
54
|
+
check or downgrade the claim's status:
|
|
55
|
+
|
|
56
|
+
- "looks good" / "looks correct" / "appears to work" / "seems fine"
|
|
57
|
+
- "should work" / "expected to pass" / "would normally"
|
|
58
|
+
- paraphrased command output (e.g. "the output was about 86% accuracy")
|
|
59
|
+
- file references without line numbers (e.g. "in encoder.py")
|
|
60
|
+
- "I verified this" / "I checked" / "I ran tests" with no artifact
|
|
61
|
+
|
|
62
|
+
**Verbatim rule.** Command-output evidence must be a copy-paste of the
|
|
63
|
+
actual line, not a summary or interpretation. If the output is long,
|
|
64
|
+
quote the single diagnostic line (the error message, the metric line,
|
|
65
|
+
the assertion). Do not invent output. If a check did not produce a line
|
|
66
|
+
you can quote, the check did not run.
|
|
67
|
+
|
|
68
|
+
**One-kind-per-cell.** If a claim needs two kinds of evidence (e.g.
|
|
69
|
+
file exists AND has correct shape), use two separate table rows. Do
|
|
70
|
+
not pack mixed kinds into one cell.
|
|
71
|
+
|
|
72
|
+
**Status follows evidence, not the other way around.** Decide each
|
|
73
|
+
claim's status from what the evidence shows. Do not pick a status and
|
|
74
|
+
then look for evidence to support it.
|
|
75
|
+
|
|
76
|
+
</evidence_standard>
|
|
77
|
+
|
|
78
|
+
<tiered_verification>
|
|
79
|
+
|
|
80
|
+
## Verification Levels
|
|
81
|
+
|
|
82
|
+
### Level 1: Sanity Check (ALWAYS possible)
|
|
83
|
+
|
|
84
|
+
**Purpose:** Catch obvious failures before investing in deeper checks.
|
|
85
|
+
|
|
86
|
+
**What it checks:**
|
|
87
|
+
- Format validity (file exists, correct shape, parseable)
|
|
88
|
+
- Distribution checks (data isn't all zeros, weights are initialized)
|
|
89
|
+
- Crash tests (code runs without errors on trivial input)
|
|
90
|
+
- Type/shape correctness (tensors have expected dimensions)
|
|
91
|
+
- Basic smoke tests (forward pass produces output)
|
|
92
|
+
|
|
93
|
+
**Tools:**
|
|
94
|
+
```bash
|
|
95
|
+
# Format check
|
|
96
|
+
python -c "import yaml; yaml.safe_load(open('config.yaml'))"
|
|
97
|
+
|
|
98
|
+
# Shape check
|
|
99
|
+
python -c "
|
|
100
|
+
import torch
|
|
101
|
+
model = torch.load('checkpoint.pt')
|
|
102
|
+
print('Params:', sum(p.numel() for p in model.values()))
|
|
103
|
+
"
|
|
104
|
+
|
|
105
|
+
# Crash test
|
|
106
|
+
python -c "
|
|
107
|
+
from src.models.encoder import Encoder
|
|
108
|
+
import torch; m = Encoder.from_config('configs/default.yaml'); out = m(torch.randn(1, 10, 512))
|
|
109
|
+
print('Output shape:', out.shape)
|
|
110
|
+
"
|
|
111
|
+
|
|
112
|
+
# Distribution check
|
|
113
|
+
python -c "
|
|
114
|
+
import numpy as np
|
|
115
|
+
data = np.load('data/processed/train.npy')
|
|
116
|
+
print('Mean:', data.mean(), 'Std:', data.std(), 'NaN:', np.isnan(data).sum())
|
|
117
|
+
"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Status:** PASS if all sanity checks succeed, FAIL if any crash or produce degenerate output.
|
|
121
|
+
|
|
122
|
+
### Level 2: Proxy Metric (indirect evaluation)
|
|
123
|
+
|
|
124
|
+
**Purpose:** Validate approach viability without full pipeline evaluation.
|
|
125
|
+
|
|
126
|
+
**What it checks:**
|
|
127
|
+
- Small-subset evaluation (run on 10% of test data for speed)
|
|
128
|
+
- Ablation reproduction (does removing component X degrade performance?)
|
|
129
|
+
- Proxy comparisons (compare against simple baseline on subset)
|
|
130
|
+
- Convergence checks (loss curve shape, learning rate behavior)
|
|
131
|
+
- Component-level metrics (attention entropy, gradient norms)
|
|
132
|
+
|
|
133
|
+
**Tools:**
|
|
134
|
+
```bash
|
|
135
|
+
# Quick evaluation on subset
|
|
136
|
+
python eval.py --model checkpoint.pt --dataset test_subset --max-samples 100
|
|
137
|
+
|
|
138
|
+
# Convergence check
|
|
139
|
+
python -c "
|
|
140
|
+
import json
|
|
141
|
+
logs = json.load(open('logs/training.json'))
|
|
142
|
+
losses = [e['loss'] for e in logs]
|
|
143
|
+
print('Final loss:', losses[-1], 'Min loss:', min(losses))
|
|
144
|
+
print('Converged:', losses[-1] < losses[0] * 0.1)
|
|
145
|
+
"
|
|
146
|
+
|
|
147
|
+
# Proxy comparison
|
|
148
|
+
python eval.py --model checkpoint.pt --dataset proxy_set --compare baseline.pt
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
**Status:** PASS if proxy metrics meet targets, PARTIAL if metrics are trending correctly but are not yet at target (note the expected trajectory), FAIL if clearly below target.
|
|
152
|
+
|
|
153
|
+
### Level 3: Integration Metric (DEFERRED)
|
|
154
|
+
|
|
155
|
+
**Purpose:** Full pipeline validation that can only happen after integration.
|
|
156
|
+
|
|
157
|
+
**What it checks:**
|
|
158
|
+
- Full test set evaluation (all data, all metrics)
|
|
159
|
+
- End-to-end pipeline performance (data in → predictions out)
|
|
160
|
+
- Cross-component compatibility (model A's output works with model B's input)
|
|
161
|
+
- Production-readiness metrics (latency, throughput, memory)
|
|
162
|
+
- Comparison with published baselines on standard benchmarks
|
|
163
|
+
|
|
164
|
+
**When it happens:** At integration phases, when all components are assembled.
|
|
165
|
+
|
|
166
|
+
**Tracking:** Deferred items are logged in STATE.md and collected during integration phases.
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
# Track deferred validation
|
|
170
|
+
node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js state add-deferred-validation \
|
|
171
|
+
--phase "${PHASE}" --plan "${PLAN}" \
|
|
172
|
+
--description "Full test set accuracy evaluation" \
|
|
173
|
+
--metric "accuracy" --target ">85%" \
|
|
174
|
+
--depends-on "integration of encoder + decoder + inference pipeline"
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
</tiered_verification>
|
|
178
|
+
|
|
179
|
+
<verification_process>
|
|
180
|
+
|
|
181
|
+
## Step 0: Check for Previous Verification and EVAL.md
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
cat "$PHASE_DIR"/*-VERIFICATION.md 2>/dev/null
|
|
185
|
+
cat "$PHASE_DIR"/*-EVAL.md 2>/dev/null
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**If EVAL.md exists:** Use it as the verification plan (designed by grd-eval-planner). It specifies which checks to run at each verification level.
|
|
189
|
+
|
|
190
|
+
**If previous verification exists with `gaps:` section → RE-VERIFICATION MODE:**
|
|
191
|
+
|
|
192
|
+
1. Parse previous VERIFICATION.md frontmatter
|
|
193
|
+
2. Extract `must_haves` (truths, artifacts, key_links)
|
|
194
|
+
3. Extract `gaps` (items that failed)
|
|
195
|
+
4. Set `is_re_verification = true`
|
|
196
|
+
5. **Skip to Step 3** with optimization:
|
|
197
|
+
- **Failed items:** Full verification at appropriate tier
|
|
198
|
+
- **Passed items:** Quick regression check (sanity only)
|
|
199
|
+
|
|
200
|
+
**If no previous verification OR no `gaps:` section → INITIAL MODE:**
|
|
201
|
+
|
|
202
|
+
Set `is_re_verification = false`, proceed with Step 1.
|
|
203
|
+
|
|
204
|
+
## Step 1: Load Context (Initial Mode Only)
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
ls "$PHASE_DIR"/*-PLAN.md 2>/dev/null
|
|
208
|
+
ls "$PHASE_DIR"/*-SUMMARY.md 2>/dev/null
|
|
209
|
+
cat "$PHASE_DIR"/*-CONTEXT.md 2>/dev/null
|
|
210
|
+
node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js roadmap get-phase "$PHASE_NUM"
|
|
211
|
+
grep -E "^\| $PHASE_NUM" .planning/REQUIREMENTS.md 2>/dev/null
|
|
212
|
+
cat ${research_dir}/LANDSCAPE.md 2>/dev/null
|
|
213
|
+
cat ${research_dir}/PAPERS.md 2>/dev/null
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Extract phase goal from ROADMAP.md — this is the outcome to verify, not the tasks.
|
|
217
|
+
Extract verification_level from PLAN.md frontmatter — determines which tier to apply.
|
|
218
|
+
Extract research context — to verify results match paper expectations.
|
|
219
|
+
Extract `webmcp_available` and `webmcp_skip_reason` from the init JSON passed by the orchestrator (from `cmdInitVerifyWork` output). These control whether Step 5b (WebMCP Verification) runs.
|
|
220
|
+
|
|
221
|
+
## Step 2: Establish Must-Haves and Determine Verification Tier (Initial Mode Only)
|
|
222
|
+
|
|
223
|
+
In re-verification mode, must-haves come from Step 0.
|
|
224
|
+
|
|
225
|
+
**Option A: Must-haves in PLAN frontmatter**
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
grep -l "must_haves:" "$PHASE_DIR"/*-PLAN.md 2>/dev/null
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
If found, extract and use.
|
|
232
|
+
|
|
233
|
+
**Option B: Derive from phase goal**
|
|
234
|
+
|
|
235
|
+
If no must_haves in frontmatter:
|
|
236
|
+
1. **State the goal** from ROADMAP.md
|
|
237
|
+
2. **Derive truths** with quantitative targets from research
|
|
238
|
+
3. **Cross-check truths against CONTEXT.md locked decisions** — if CONTEXT.md exists, ensure derived truths align with locked decisions and do not include deferred ideas
|
|
239
|
+
4. **Derive artifacts** — concrete file paths
|
|
240
|
+
5. **Derive key links** — connections
|
|
241
|
+
6. **Document derived must-haves** before proceeding
|
|
242
|
+
|
|
243
|
+
**For each truth/artifact, determine verification tier:**
|
|
244
|
+
|
|
245
|
+
| Can verify now? | How? | Tier |
|
|
246
|
+
|-----------------|------|------|
|
|
247
|
+
| Format/shape/crash | Sanity check | Level 1 |
|
|
248
|
+
| Subset/proxy metric | Proxy evaluation | Level 2 |
|
|
249
|
+
| Needs full pipeline | Deferred | Level 3 |
|
|
250
|
+
|
|
251
|
+
## Step 3: Level 1 — Sanity Verification (ALWAYS run)
|
|
252
|
+
|
|
253
|
+
For every truth and artifact, regardless of verification_level setting:
|
|
254
|
+
|
|
255
|
+
**3-bundle. Mechanical Verify Bundle (PREFERRED — one call):**
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
MECH_RESULT=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js verify mechanical "$PHASE")
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
Parse JSON result:
|
|
262
|
+
`{ passed, phase, plan_count, total_checks, passed_count, failed_count, checks: [{ check, scope, passed, detail, data }] }`
|
|
263
|
+
|
|
264
|
+
`check` is one of `frontmatter | artifacts | key_links | references |
|
|
265
|
+
plan_summary_completeness`. The bundle runs the 4 PLAN.md mechanical
|
|
266
|
+
checks (frontmatter completeness, artifact existence, key-link
|
|
267
|
+
verification, @-reference resolution) across every PLAN.md in the
|
|
268
|
+
phase, plus a phase-level plan-vs-summary completeness check. Use the
|
|
269
|
+
bundle as your first Level 1 pass — it produces all the evidence
|
|
270
|
+
needed for the Evidence Standard's "command output" kind without
|
|
271
|
+
invoking the LLM-style verifier agent.
|
|
272
|
+
|
|
273
|
+
If the bundle's JSON result reports `failed_count > 0`, cite each failing check's
|
|
274
|
+
`detail` field as evidence in VERIFICATION.md. If you need finer
|
|
275
|
+
detail than `detail` provides, drill into the discrete commands below.
|
|
276
|
+
|
|
277
|
+
**3a. Artifact Existence and Format (discrete, when needed):**
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
ARTIFACT_RESULT=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js verify artifacts "$PLAN_PATH")
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Parse JSON result: `{ all_passed, passed, total, artifacts: [{path, exists, issues, passed}] }`
|
|
284
|
+
|
|
285
|
+
**3b. Crash Tests:**
|
|
286
|
+
|
|
287
|
+
For each executable artifact (scripts, models, pipelines):
|
|
288
|
+
```bash
|
|
289
|
+
# Does it import without errors?
|
|
290
|
+
python -c "import src.models.encoder" 2>&1
|
|
291
|
+
|
|
292
|
+
# Does it run on trivial input?
|
|
293
|
+
python -c "
|
|
294
|
+
from src.models.encoder import Encoder
|
|
295
|
+
import torch
|
|
296
|
+
m = Encoder.from_config('configs/default.yaml')
|
|
297
|
+
out = m(torch.randn(1, 10, 512))
|
|
298
|
+
assert out.shape[0] == 1, f'Bad output shape: {out.shape}'
|
|
299
|
+
print('SANITY PASS')
|
|
300
|
+
" 2>&1
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**3c. Distribution Checks:**
|
|
304
|
+
|
|
305
|
+
For data artifacts:
|
|
306
|
+
```bash
|
|
307
|
+
python -c "
|
|
308
|
+
import numpy as np
|
|
309
|
+
data = np.load('$ARTIFACT_PATH')
|
|
310
|
+
assert not np.isnan(data).any(), 'Contains NaN'
|
|
311
|
+
assert data.std() > 0, 'Zero variance'
|
|
312
|
+
print('DISTRIBUTION PASS')
|
|
313
|
+
" 2>&1
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
**3d. Wiring Verification:**
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
LINKS_RESULT=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js verify key-links "$PLAN_PATH")
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
**Sanity Status:**
|
|
323
|
+
- ALL PASS → proceed to Level 2 (if applicable)
|
|
324
|
+
- ANY FAIL → report as Level 1 failure (critical — basic functionality broken)
|
|
325
|
+
|
|
326
|
+
## Step 4: Level 2 — Proxy Metric Verification (if verification_level >= proxy)
|
|
327
|
+
|
|
328
|
+
If plan's `verification_level` is `proxy` or higher:
|
|
329
|
+
|
|
330
|
+
**4a. Run Quick Evaluations:**
|
|
331
|
+
|
|
332
|
+
```bash
|
|
333
|
+
# Use EVAL.md evaluation plan if available
|
|
334
|
+
# Otherwise, derive from eval_metrics in plan frontmatter
|
|
335
|
+
|
|
336
|
+
python eval.py --model $CHECKPOINT --dataset test_subset --max-samples 100
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
**4b. Compare Against Baselines:**
|
|
340
|
+
|
|
341
|
+
From LANDSCAPE.md/PAPERS.md, extract expected performance:
|
|
342
|
+
|
|
343
|
+
| Metric | Paper Baseline | Our Target | Achieved | Status |
|
|
344
|
+
|--------|---------------|------------|----------|--------|
|
|
345
|
+
| accuracy | 82% | >85% | ? | ? |
|
|
346
|
+
|
|
347
|
+
**4c. Convergence Analysis:**
|
|
348
|
+
|
|
349
|
+
```bash
|
|
350
|
+
python -c "
|
|
351
|
+
import json
|
|
352
|
+
logs = json.load(open('$LOG_PATH'))
|
|
353
|
+
losses = [e['loss'] for e in logs[-10:]]
|
|
354
|
+
print('Final 10 losses:', losses)
|
|
355
|
+
print('Trend:', 'decreasing' if losses[-1] < losses[0] else 'NOT decreasing')
|
|
356
|
+
"
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
**4d. Ablation Checks (if applicable):**
|
|
360
|
+
|
|
361
|
+
If plan implemented specific technique from a paper, verify removing it degrades performance (confirms the technique is actually contributing).
|
|
362
|
+
|
|
363
|
+
**Proxy Status:**
|
|
364
|
+
- Metrics meet target → PASS
|
|
365
|
+
- Metrics trending correctly but not at target → PARTIAL (note expected trajectory)
|
|
366
|
+
- Metrics clearly below target → FAIL
|
|
367
|
+
|
|
368
|
+
## Step 5: Level 3 — Deferred Verification Tracking
|
|
369
|
+
|
|
370
|
+
If plan's `verification_level` is `deferred`:
|
|
371
|
+
|
|
372
|
+
**5a. Record Deferred Items:**
|
|
373
|
+
|
|
374
|
+
For each truth/artifact that requires full pipeline:
|
|
375
|
+
```bash
|
|
376
|
+
node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js state add-deferred-validation \
|
|
377
|
+
--phase "${PHASE}" --plan "${PLAN}" \
|
|
378
|
+
--description "${TRUTH_DESCRIPTION}" \
|
|
379
|
+
--metric "${METRIC_NAME}" --target "${TARGET_VALUE}" \
|
|
380
|
+
--depends-on "${INTEGRATION_DEPENDENCY}"
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
**5b. At Integration Phases — Collect Deferred Validations:**
|
|
384
|
+
|
|
385
|
+
When verifying an integration phase, automatically collect ALL deferred validations from prior phases:
|
|
386
|
+
|
|
387
|
+
```bash
|
|
388
|
+
DEFERRED=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js state get-deferred-validations)
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
For each deferred item:
|
|
392
|
+
1. Check if dependency is now met
|
|
393
|
+
2. If yes → run the deferred verification at full scale
|
|
394
|
+
3. Record result alongside original phase reference
|
|
395
|
+
4. Update deferred validation status
|
|
396
|
+
|
|
397
|
+
## Step 5b: WebMCP Verification (if webmcp_available)
|
|
398
|
+
|
|
399
|
+
**Skip condition:** If `webmcp_available` is not `true` (from init JSON or EVAL.md context), skip this step entirely. Include a note in VERIFICATION.md: "WebMCP verification skipped — MCP not available ({webmcp_skip_reason})."
|
|
400
|
+
|
|
401
|
+
**When enabled:**
|
|
402
|
+
|
|
403
|
+
**5b-1. Discover registered tools:**
|
|
404
|
+
|
|
405
|
+
Call `hive_list_registered_tools` to get the list of all registered WebMCP tools for the current application.
|
|
406
|
+
|
|
407
|
+
Parse the response to identify:
|
|
408
|
+
- **Generic tools:** `hive_get_health_status`, `hive_check_console_errors`, `hive_get_page_info` (always expected)
|
|
409
|
+
- **Page-specific tools:** Any tools beyond the generic set (these may be defined in EVAL.md via `useWebMcpTool()` definitions from grd-eval-planner)
|
|
410
|
+
|
|
411
|
+
**5b-2. Call generic health checks:**
|
|
412
|
+
|
|
413
|
+
```
|
|
414
|
+
hive_get_health_status → Verify backend responding
|
|
415
|
+
hive_check_console_errors → Verify no JS errors
|
|
416
|
+
hive_get_page_info → Verify app rendering
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
Record each result with status (PASS/FAIL) and details.
|
|
420
|
+
|
|
421
|
+
**5b-3. Call page-specific tools (if found):**
|
|
422
|
+
|
|
423
|
+
If EVAL.md contains `useWebMcpTool()` definitions, match them against the discovered tool list:
|
|
424
|
+
- For each defined tool that exists in the registered list: call it and record the result
|
|
425
|
+
- For each defined tool that does NOT exist in the registered list: record as "NOT REGISTERED — tool not found"
|
|
426
|
+
|
|
427
|
+
If no page-specific tools are defined in EVAL.md, note: "No page-specific tools defined in EVAL.md."
|
|
428
|
+
|
|
429
|
+
**5b-4. Record results for VERIFICATION.md:**
|
|
430
|
+
|
|
431
|
+
Store results in a structured format for inclusion in the WebMCP Verification section of the output.
|
|
432
|
+
|
|
433
|
+
## Step 6: Experiment Verification
|
|
434
|
+
|
|
435
|
+
**Check if experimental results match paper expectations:**
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
cat .planning/experiments/${PHASE}-*-results.yaml 2>/dev/null
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
For each experiment result:
|
|
442
|
+
1. Compare achieved metrics against paper-reported baselines
|
|
443
|
+
2. Check if improvement magnitude matches expectations
|
|
444
|
+
3. Flag significant deviations (both positive and negative)
|
|
445
|
+
|
|
446
|
+
**Experiment verification checks:**
|
|
447
|
+
|
|
448
|
+
| Check | Status |
|
|
449
|
+
|-------|--------|
|
|
450
|
+
| Metric direction correct (improvement over baseline) | PASS/FAIL |
|
|
451
|
+
| Metric magnitude plausible (within 2x of paper's improvement) | PASS/WARN |
|
|
452
|
+
| No degenerate outputs (mode collapse, constant predictions) | PASS/FAIL |
|
|
453
|
+
| Training stability (no loss explosions, gradient issues) | PASS/FAIL |
|
|
454
|
+
|
|
455
|
+
## Step 7: Check Requirements Coverage
|
|
456
|
+
|
|
457
|
+
If REQUIREMENTS.md has requirements mapped to this phase:
|
|
458
|
+
|
|
459
|
+
```bash
|
|
460
|
+
grep -E "Phase $PHASE_NUM" .planning/REQUIREMENTS.md 2>/dev/null
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
For each requirement: parse description → identify supporting truths/artifacts → determine status.
|
|
464
|
+
|
|
465
|
+
## Step 8: Scan for Anti-Patterns
|
|
466
|
+
|
|
467
|
+
Identify files modified in this phase from SUMMARY.md key-files section:
|
|
468
|
+
|
|
469
|
+
```bash
|
|
470
|
+
SUMMARY_FILES=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js summary-extract "$PHASE_DIR"/*-SUMMARY.md --fields key-files)
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
Run anti-pattern detection on each file listed in `SUMMARY_FILES` (each command below takes one path as `$file`):
|
|
474
|
+
|
|
475
|
+
```bash
|
|
476
|
+
# TODO/FIXME/placeholder comments
|
|
477
|
+
grep -n -E "TODO|FIXME|XXX|HACK|PLACEHOLDER" "$file" 2>/dev/null
|
|
478
|
+
grep -n -E "placeholder|coming soon|will be here" "$file" -i 2>/dev/null
|
|
479
|
+
# Empty implementations
|
|
480
|
+
grep -n -E "return None|return \{\}|return \[\]|pass$" "$file" 2>/dev/null
|
|
481
|
+
# Hardcoded values that should be config
|
|
482
|
+
grep -n -E "= 0\.001|= 32|= 512" "$file" 2>/dev/null | head -5
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
## Step 9: Human Verification Needs
|
|
486
|
+
|
|
487
|
+
**Always needs human:** Visual result inspection, qualitative output assessment, real-time behavior, external service integration, subjective quality evaluation.
|
|
488
|
+
|
|
489
|
+
**Research gates:** If `research_gates.verification_design=true` in config, pause for human review of the verification methodology itself before reporting results.
|
|
490
|
+
|
|
491
|
+
## Step 10: Determine Overall Status
|
|
492
|
+
|
|
493
|
+
**Status: passed** — All truths verified at their designated tier, all Level 1 checks pass, proxy metrics meet targets (if Level 2), no blocker anti-patterns.
|
|
494
|
+
|
|
495
|
+
**Status: gaps_found** — One or more truths failed at their verification tier, artifacts missing/stub, key links not wired, or blocker anti-patterns found.
|
|
496
|
+
|
|
497
|
+
**Status: human_needed** — All automated checks pass but items flagged for human verification.
|
|
498
|
+
|
|
499
|
+
**Status: deferred** — Level 1 and 2 checks pass, but Level 3 items tracked for integration phase.
|
|
500
|
+
|
|
501
|
+
**Score:** `verified_truths / total_truths` at each level.
|
|
502
|
+
|
|
503
|
+
## Step 11: Structure Gap Output (If Gaps Found)
|
|
504
|
+
|
|
505
|
+
Structure gaps in YAML frontmatter for `/grd:plan-phase --gaps`:
|
|
506
|
+
|
|
507
|
+
```yaml
|
|
508
|
+
gaps:
|
|
509
|
+
- truth: "Observable truth that failed"
|
|
510
|
+
status: failed
|
|
511
|
+
verification_level: 2
|
|
512
|
+
reason: "Brief explanation"
|
|
513
|
+
quantitative:
|
|
514
|
+
metric: "accuracy"
|
|
515
|
+
expected: ">85%"
|
|
516
|
+
actual: "72%"
|
|
517
|
+
artifacts:
|
|
518
|
+
- path: "src/path/to/file.py"
|
|
519
|
+
issue: "What's wrong"
|
|
520
|
+
missing:
|
|
521
|
+
- "Specific thing to add/fix"
|
|
522
|
+
paper_reference: "Paper X suggested Y, but Z happened"
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
</verification_process>
|
|
526
|
+
|
|
527
|
+
<output>
|
|
528
|
+
|
|
529
|
+
## Create VERIFICATION.md
|
|
530
|
+
|
|
531
|
+
Create `${phase_dir}/{phase}-VERIFICATION.md`:
|
|
532
|
+
|
|
533
|
+
```markdown
|
|
534
|
+
---
|
|
535
|
+
phase: XX-name
|
|
536
|
+
verified: YYYY-MM-DDTHH:MM:SSZ
|
|
537
|
+
status: passed | gaps_found | human_needed | deferred
|
|
538
|
+
score:
|
|
539
|
+
level_1: N/M sanity checks passed
|
|
540
|
+
level_2: N/M proxy metrics met (if applicable)
|
|
541
|
+
level_3: N/M deferred (tracked in STATE.md)
|
|
542
|
+
re_verification:
|
|
543
|
+
previous_status: gaps_found
|
|
544
|
+
previous_score: 2/5
|
|
545
|
+
gaps_closed:
|
|
546
|
+
- "Truth that was fixed"
|
|
547
|
+
gaps_remaining: []
|
|
548
|
+
regressions: []
|
|
549
|
+
gaps:
|
|
550
|
+
- truth: "Observable truth that failed"
|
|
551
|
+
status: failed
|
|
552
|
+
verification_level: 2
|
|
553
|
+
reason: "Why it failed"
|
|
554
|
+
quantitative:
|
|
555
|
+
metric: "accuracy"
|
|
556
|
+
expected: ">85%"
|
|
557
|
+
actual: "72%"
|
|
558
|
+
artifacts:
|
|
559
|
+
- path: "src/path/to/file.py"
|
|
560
|
+
issue: "What's wrong"
|
|
561
|
+
missing:
|
|
562
|
+
- "Specific thing to add/fix"
|
|
563
|
+
deferred_validations:
|
|
564
|
+
- description: "Full test set evaluation"
|
|
565
|
+
metric: "accuracy"
|
|
566
|
+
target: ">85%"
|
|
567
|
+
depends_on: "integration phase"
|
|
568
|
+
tracked_in: "STATE.md"
|
|
569
|
+
human_verification:
|
|
570
|
+
- test: "What to do"
|
|
571
|
+
expected: "What should happen"
|
|
572
|
+
why_human: "Why can't verify programmatically"
|
|
573
|
+
---
|
|
574
|
+
|
|
575
|
+
# Phase {X}: {Name} Verification Report
|
|
576
|
+
|
|
577
|
+
**Phase Goal:** {goal from ROADMAP.md}
|
|
578
|
+
**Verified:** {timestamp}
|
|
579
|
+
**Status:** {status}
|
|
580
|
+
**Re-verification:** {Yes — after gap closure | No — initial verification}
|
|
581
|
+
|
|
582
|
+
## Verification Summary by Tier
|
|
583
|
+
|
|
584
|
+
### Level 1: Sanity Checks
|
|
585
|
+
|
|
586
|
+
| # | Check | Status | Evidence |
|
|
587
|
+
|---|-------|--------|----------|
|
|
588
|
+
| 1 | File exists: src/models/encoder.py | PASS | `src/models/encoder.py:1-245` |
|
|
589
|
+
| 2 | Forward pass completes | PASS | `Output shape: torch.Size([1, 10, 512])` |
|
|
590
|
+
| 3 | No NaN in weights | PASS | `NaN: 0` |
|
|
591
|
+
|
|
592
|
+
**Level 1 Score:** {N}/{M} passed
|
|
593
|
+
|
|
594
|
+
### Level 2: Proxy Metrics
|
|
595
|
+
|
|
596
|
+
| # | Metric | Baseline | Target | Achieved | Status |
|
|
597
|
+
|---|--------|----------|--------|----------|--------|
|
|
598
|
+
| 1 | accuracy (100 samples) | 82% | >85% | 86.3% | PASS |
|
|
599
|
+
| 2 | inference_latency | 60ms | <50ms | 45ms | PASS |
|
|
600
|
+
|
|
601
|
+
**Level 2 Score:** {N}/{M} met target
|
|
602
|
+
|
|
603
|
+
### Level 3: Deferred Validations
|
|
604
|
+
|
|
605
|
+
| # | Validation | Metric | Target | Depends On | Status |
|
|
606
|
+
|---|-----------|--------|--------|------------|--------|
|
|
607
|
+
| 1 | Full test set eval | accuracy | >85% | integration | DEFERRED |
|
|
608
|
+
|
|
609
|
+
**Level 3:** {N} items tracked for integration phase
|
|
610
|
+
|
|
611
|
+
## Goal Achievement
|
|
612
|
+
|
|
613
|
+
### Observable Truths
|
|
614
|
+
|
|
615
|
+
| # | Truth | Verification Level | Status | Evidence |
|
|
616
|
+
|---|-------|--------------------|--------|----------|
|
|
617
|
+
| 1 | {truth} | Level 1 | PASS | {evidence} |
|
|
618
|
+
| 2 | {truth} | Level 2 | PASS | {quantitative result} |
|
|
619
|
+
| 3 | {truth} | Level 3 | DEFERRED | Level 3 — tracked in STATE.md: {reason} |
|
|
620
|
+
|
|
621
|
+
### Required Artifacts
|
|
622
|
+
|
|
623
|
+
| Artifact | Expected | Exists | Sanity | Wired |
|
|
624
|
+
|----------|----------|--------|--------|-------|
|
|
625
|
+
| `path` | description | Yes | PASS | PASS |
|
|
626
|
+
|
|
627
|
+
### Key Link Verification
|
|
628
|
+
|
|
629
|
+
| From | To | Via | Status | Details |
|
|
630
|
+
|------|----|----|--------|---------|
|
|
631
|
+
| train.py | encoder.py | import | WIRED | `from src.models import Encoder` |
|
|
632
|
+
|
|
633
|
+
## Experiment Verification
|
|
634
|
+
|
|
635
|
+
### Paper Expectation Comparison
|
|
636
|
+
|
|
637
|
+
| Technique | Paper Reports | Our Result | Match? |
|
|
638
|
+
|-----------|--------------|------------|--------|
|
|
639
|
+
| RoPE embeddings | +3% accuracy | +4.3% accuracy | YES (better) |
|
|
640
|
+
| Flash attention | 2x speedup | 1.8x speedup | CLOSE |
|
|
641
|
+
|
|
642
|
+
### Experiment Integrity
|
|
643
|
+
|
|
644
|
+
| Check | Status | Details |
|
|
645
|
+
|-------|--------|---------|
|
|
646
|
+
| Metric direction correct | PASS | Accuracy improved over baseline |
|
|
647
|
+
| Magnitude plausible | PASS | Within expected range |
|
|
648
|
+
| No degenerate outputs | PASS | Predictions distributed normally |
|
|
649
|
+
| Training stable | PASS | No loss explosions |
|
|
650
|
+
|
|
651
|
+
## WebMCP Verification
|
|
652
|
+
|
|
653
|
+
{If webmcp_available:}
|
|
654
|
+
|
|
655
|
+
### Tool Discovery
|
|
656
|
+
|
|
657
|
+
| Tool | Type | Registered | Status |
|
|
658
|
+
|------|------|------------|--------|
|
|
659
|
+
| hive_get_health_status | generic | Yes | PASS/FAIL |
|
|
660
|
+
| hive_check_console_errors | generic | Yes | PASS/FAIL |
|
|
661
|
+
| hive_get_page_info | generic | Yes | PASS/FAIL |
|
|
662
|
+
| {page_specific_tool} | page-specific | Yes/No | PASS/FAIL/NOT REGISTERED |
|
|
663
|
+
|
|
664
|
+
### Health Check Results
|
|
665
|
+
|
|
666
|
+
| Check | Status | Details |
|
|
667
|
+
|-------|--------|---------|
|
|
668
|
+
| Backend health | PASS | {response summary} |
|
|
669
|
+
| Console errors | PASS | No new errors |
|
|
670
|
+
| Page rendering | PASS | {page info summary} |
|
|
671
|
+
|
|
672
|
+
### Page-Specific Tool Results
|
|
673
|
+
|
|
674
|
+
| Tool | Expected (from EVAL.md) | Result | Notes |
|
|
675
|
+
|------|------------------------|--------|-------|
|
|
676
|
+
| {tool} | {expected behavior} | {actual result} | |
|
|
677
|
+
|
|
678
|
+
{If webmcp NOT available:}
|
|
679
|
+
|
|
680
|
+
WebMCP verification skipped — MCP not available ({reason}).
|
|
681
|
+
|
|
682
|
+
## Requirements Coverage
|
|
683
|
+
|
|
684
|
+
| Requirement | Status | Blocking Issue |
|
|
685
|
+
|-------------|--------|----------------|
|
|
686
|
+
| {req} | PASS | - |
|
|
687
|
+
|
|
688
|
+
## Anti-Patterns Found
|
|
689
|
+
|
|
690
|
+
| File | Line | Pattern | Severity | Impact |
|
|
691
|
+
|------|------|---------|----------|--------|
|
|
692
|
+
| {file} | {line} | {pattern} | {severity} | {impact} |
|
|
693
|
+
|
|
694
|
+
## Human Verification Required
|
|
695
|
+
|
|
696
|
+
{Items needing human testing — detailed format for user}
|
|
697
|
+
|
|
698
|
+
## Gaps Summary
|
|
699
|
+
|
|
700
|
+
{Narrative summary of what's missing and why, with quantitative data}
|
|
701
|
+
|
|
702
|
+
## Reflection
|
|
703
|
+
|
|
704
|
+
Read `hypothesis:` and `predicted_outcome:` from the PLAN.md frontmatter
|
|
705
|
+
(top-level scalars; required by `agents/grd-planner.md`). Compare against
|
|
706
|
+
what verification actually found and fill the table below. `verdict` is
|
|
707
|
+
one of: `confirmed` (predicted outcome was observed), `partial` (some but
|
|
708
|
+
not all of the prediction held), `falsified` (prediction did not hold),
|
|
709
|
+
`unknown` (insufficient signal to judge — explain in evidence).
|
|
710
|
+
|
|
711
|
+
| Field | Value |
|
|
712
|
+
|-------|-------|
|
|
713
|
+
| hypothesis | {copy from PLAN.md `hypothesis:`} |
|
|
714
|
+
| predicted_outcome | {copy from PLAN.md `predicted_outcome:`} |
|
|
715
|
+
| actual_outcome | {one sentence describing what the verification actually showed} |
|
|
716
|
+
| verdict | {confirmed \| partial \| falsified \| unknown} |
|
|
717
|
+
| evidence | {2-4 bullet refs: file:line, command output line, gap table row, metric value} |
|
|
718
|
+
|
|
719
|
+
If PLAN.md is missing either scalar, set the `verdict` row to `unknown` and put
|
|
720
|
+
`PLAN.md missing required reflection scalar(s)` in evidence.
|
|
721
|
+
Do not fabricate values.
|
|
722
|
+
|
|
723
|
+
---
|
|
724
|
+
|
|
725
|
+
_Verified: {timestamp}_
|
|
726
|
+
_Verifier: Claude (grd-verifier)_
|
|
727
|
+
_Verification levels applied: Level 1 (sanity), Level 2 (proxy){, Level 3 (deferred)}_
|
|
728
|
+
```
|
|
729
|
+
|
|
730
|
+
## Return to Orchestrator
|
|
731
|
+
|
|
732
|
+
**DO NOT COMMIT.** The orchestrator bundles VERIFICATION.md with other phase artifacts.
|
|
733
|
+
|
|
734
|
+
Return with:
|
|
735
|
+
|
|
736
|
+
```markdown
|
|
737
|
+
## Verification Complete
|
|
738
|
+
|
|
739
|
+
**Status:** {passed | gaps_found | human_needed | deferred}
|
|
740
|
+
**Score:**
|
|
741
|
+
- Level 1 (Sanity): {N}/{M}
|
|
742
|
+
- Level 2 (Proxy): {N}/{M} (if applicable)
|
|
743
|
+
- Level 3 (Deferred): {N} items tracked
|
|
744
|
+
**Report:** {phase_dir}/{phase}-VERIFICATION.md
|
|
745
|
+
|
|
746
|
+
{If passed:}
|
|
747
|
+
All must-haves verified at designated levels. Phase goal achieved. Ready to proceed.
|
|
748
|
+
|
|
749
|
+
{If gaps_found:}
|
|
750
|
+
### Gaps Found
|
|
751
|
+
{N} gaps blocking goal achievement:
|
|
752
|
+
1. **{Truth 1}** (Level {X}) — {reason}
|
|
753
|
+
- Metric: {expected} vs {actual}
|
|
754
|
+
- Missing: {what needs to be added}
|
|
755
|
+
|
|
756
|
+
Structured gaps are recorded in the VERIFICATION.md frontmatter for `/grd:plan-phase --gaps`.
|
|
757
|
+
|
|
758
|
+
{If deferred:}
|
|
759
|
+
### Deferred Validations
|
|
760
|
+
Levels 1-2 pass. {N} Level 3 validations deferred to integration:
|
|
761
|
+
1. **{Validation}** — depends on {dependency}
|
|
762
|
+
|
|
763
|
+
{If human_needed:}
|
|
764
|
+
### Human Verification Required
|
|
765
|
+
{N} items need human testing:
|
|
766
|
+
1. **{Test name}** — {what to do}
|
|
767
|
+
- Expected: {what should happen}
|
|
768
|
+
|
|
769
|
+
Automated checks passed. Awaiting human verification.
|
|
770
|
+
```
|
|
771
|
+
|
|
772
|
+
</output>
|
|
773
|
+
|
|
774
|
+
<critical_rules>
|
|
775
|
+
|
|
776
|
+
**DO NOT trust SUMMARY claims.** Verify actual file contents and metric values.
|
|
777
|
+
|
|
778
|
+
**DO NOT assume existence = implementation.** Need sanity checks AND proxy metrics where applicable.
|
|
779
|
+
|
|
780
|
+
**DO NOT skip key link verification.** 80% of stubs hide here.
|
|
781
|
+
|
|
782
|
+
**DO apply the correct verification tier.** Not everything needs full evaluation — but sanity checks are ALWAYS mandatory.
|
|
783
|
+
|
|
784
|
+
**DO track deferred validations.** Level 3 items that aren't tracked are validations that will never happen.
|
|
785
|
+
|
|
786
|
+
**DO include quantitative results.** This is R&D — numbers matter. "Looks good" is never acceptable.
|
|
787
|
+
|
|
788
|
+
**DO structure gaps in YAML frontmatter** so `/grd:plan-phase --gaps` can consume them.
|
|
789
|
+
|
|
790
|
+
**DO flag for human verification when uncertain** (visual, qualitative, subjective quality).
|
|
791
|
+
|
|
792
|
+
**Research gates:** If `research_gates.verification_design=true`, pause for human review before reporting.
|
|
793
|
+
|
|
794
|
+
**Keep verification fast.** Use grep/file checks for Level 1, quick scripts for Level 2. Save expensive computation for Level 3.
|
|
795
|
+
|
|
796
|
+
**DO NOT commit.** Leave committing to the orchestrator.
|
|
797
|
+
|
|
798
|
+
</critical_rules>
|
|
799
|
+
|
|
800
|
+
<stub_detection_patterns>
|
|
801
|
+
|
|
802
|
+
## Python/ML Stubs
|
|
803
|
+
|
|
804
|
+
```python
|
|
805
|
+
# RED FLAGS:
|
|
806
|
+
def forward(self, x):
|
|
807
|
+
return x # Identity function — not a real model
|
|
808
|
+
|
|
809
|
+
def train(config):
|
|
810
|
+
pass # Empty training loop
|
|
811
|
+
|
|
812
|
+
def evaluate(model, data):
|
|
813
|
+
return {"accuracy": 0.0} # Hardcoded zeros
|
|
814
|
+
|
|
815
|
+
class Encoder(nn.Module):
|
|
816
|
+
def __init__(self):
|
|
817
|
+
super().__init__()
|
|
818
|
+
# No layers defined
|
|
819
|
+
|
|
820
|
+
# Placeholder data:
|
|
821
|
+
data = torch.randn(100, 10) # Random data instead of loading
|
|
822
|
+
labels = torch.zeros(100) # All-zero labels
|
|
823
|
+
```
|
|
824
|
+
|
|
825
|
+
## Wiring Red Flags
|
|
826
|
+
|
|
827
|
+
```python
|
|
828
|
+
# Model defined but never trained:
|
|
829
|
+
model = Encoder(config)
|
|
830
|
+
# ... no optimizer, no training loop
|
|
831
|
+
|
|
832
|
+
# Data loaded but not preprocessed:
|
|
833
|
+
raw_data = load_data("path")
|
|
834
|
+
# ... no tokenization, no normalization
|
|
835
|
+
|
|
836
|
+
# Evaluation exists but uses wrong data:
|
|
837
|
+
eval_results = evaluate(model, train_data) # Should be test_data!
|
|
838
|
+
|
|
839
|
+
# Checkpoint saved but never loaded for eval:
|
|
840
|
+
torch.save(model.state_dict(), "checkpoint.pt")
|
|
841
|
+
# ... eval script creates fresh model instead of loading
|
|
842
|
+
```
|
|
843
|
+
|
|
844
|
+
</stub_detection_patterns>
|
|
845
|
+
|
|
846
|
+
<tracker_integration>
|
|
847
|
+
|
|
848
|
+
## Issue Tracker Integration
|
|
849
|
+
|
|
850
|
+
Reference: @${CLAUDE_PLUGIN_ROOT}/references/tracker-integration.md
|
|
851
|
+
MCP protocol: @${CLAUDE_PLUGIN_ROOT}/references/mcp-tracker-protocol.md
|
|
852
|
+
|
|
853
|
+
After writing VERIFICATION.md, post the results as a comment on the phase issue (non-blocking):
|
|
854
|
+
|
|
855
|
+
**For GitHub:**
|
|
856
|
+
```bash
|
|
857
|
+
node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js tracker add-comment "${PHASE}" "${phase_dir}/${PHASE}-VERIFICATION.md" 2>/dev/null || true
|
|
858
|
+
```
|
|
859
|
+
|
|
860
|
+
**For mcp-atlassian:**
|
|
861
|
+
```bash
|
|
862
|
+
COMMENT_INFO=$(node ${CLAUDE_PLUGIN_ROOT}/bin/grd-tools.js tracker add-comment "${PHASE}" "${phase_dir}/${PHASE}-VERIFICATION.md" --raw 2>/dev/null || true)
|
|
863
|
+
```
|
|
864
|
+
If the response captured in `COMMENT_INFO` has `provider: "mcp-atlassian"`, call the MCP tool `add_comment` with the `issue_key` and `content` from that response.
|
|
865
|
+
|
|
866
|
+
</tracker_integration>
|
|
867
|
+
|
|
868
|
+
<success_criteria>
|
|
869
|
+
|
|
870
|
+
- [ ] Previous VERIFICATION.md checked (Step 0)
|
|
871
|
+
- [ ] EVAL.md loaded if exists (verification plan)
|
|
872
|
+
- [ ] If re-verification: must-haves loaded from previous, focus on failed items
|
|
873
|
+
- [ ] If initial: must-haves established (from frontmatter or derived)
|
|
874
|
+
- [ ] Level 1 (Sanity) checks run for ALL items — MANDATORY
|
|
875
|
+
- [ ] Level 2 (Proxy) checks run for items with verification_level >= proxy
|
|
876
|
+
- [ ] Level 3 (Deferred) items tracked in STATE.md for integration
|
|
877
|
+
- [ ] Experiment results compared against paper expectations
|
|
878
|
+
- [ ] All truths verified with status, evidence, and quantitative data
|
|
879
|
+
- [ ] Evidence Standard applied to every Evidence cell, gap, and Reflection row (file:line / command output / metric value / deferred; no banned phrasings; verbatim outputs)
|
|
880
|
+
- [ ] All artifacts checked (exists, sanity, wired)
|
|
881
|
+
- [ ] All key links verified
|
|
882
|
+
- [ ] Requirements coverage assessed (if applicable)
|
|
883
|
+
- [ ] Anti-patterns scanned and categorized
|
|
884
|
+
- [ ] Human verification items identified
|
|
885
|
+
- [ ] Research gates applied (if configured)
|
|
886
|
+
- [ ] Overall status determined with tiered scoring
|
|
887
|
+
- [ ] Gaps structured in YAML frontmatter (if gaps_found)
|
|
888
|
+
- [ ] Deferred validations tracked (if Level 3 items exist)
|
|
889
|
+
- [ ] Re-verification metadata included (if previous existed)
|
|
890
|
+
- [ ] VERIFICATION.md created with quantitative results tables
|
|
891
|
+
- [ ] Verification results posted to tracker (if configured)
|
|
892
|
+
- [ ] Results returned to orchestrator (NOT committed)
|
|
893
|
+
</success_criteria>
|