get-research-done 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +560 -0
  3. package/agents/grd-architect.md +789 -0
  4. package/agents/grd-codebase-mapper.md +738 -0
  5. package/agents/grd-critic.md +1065 -0
  6. package/agents/grd-debugger.md +1203 -0
  7. package/agents/grd-evaluator.md +948 -0
  8. package/agents/grd-executor.md +784 -0
  9. package/agents/grd-explorer.md +2063 -0
  10. package/agents/grd-graduator.md +484 -0
  11. package/agents/grd-integration-checker.md +423 -0
  12. package/agents/grd-phase-researcher.md +641 -0
  13. package/agents/grd-plan-checker.md +745 -0
  14. package/agents/grd-planner.md +1386 -0
  15. package/agents/grd-project-researcher.md +865 -0
  16. package/agents/grd-research-synthesizer.md +256 -0
  17. package/agents/grd-researcher.md +2361 -0
  18. package/agents/grd-roadmapper.md +605 -0
  19. package/agents/grd-verifier.md +778 -0
  20. package/bin/install.js +1294 -0
  21. package/commands/grd/add-phase.md +207 -0
  22. package/commands/grd/add-todo.md +193 -0
  23. package/commands/grd/architect.md +283 -0
  24. package/commands/grd/audit-milestone.md +277 -0
  25. package/commands/grd/check-todos.md +228 -0
  26. package/commands/grd/complete-milestone.md +136 -0
  27. package/commands/grd/debug.md +169 -0
  28. package/commands/grd/discuss-phase.md +86 -0
  29. package/commands/grd/evaluate.md +1095 -0
  30. package/commands/grd/execute-phase.md +339 -0
  31. package/commands/grd/explore.md +258 -0
  32. package/commands/grd/graduate.md +323 -0
  33. package/commands/grd/help.md +482 -0
  34. package/commands/grd/insert-phase.md +227 -0
  35. package/commands/grd/insights.md +231 -0
  36. package/commands/grd/join-discord.md +18 -0
  37. package/commands/grd/list-phase-assumptions.md +50 -0
  38. package/commands/grd/map-codebase.md +71 -0
  39. package/commands/grd/new-milestone.md +721 -0
  40. package/commands/grd/new-project.md +1008 -0
  41. package/commands/grd/pause-work.md +134 -0
  42. package/commands/grd/plan-milestone-gaps.md +295 -0
  43. package/commands/grd/plan-phase.md +525 -0
  44. package/commands/grd/progress.md +364 -0
  45. package/commands/grd/quick-explore.md +236 -0
  46. package/commands/grd/quick.md +309 -0
  47. package/commands/grd/remove-phase.md +349 -0
  48. package/commands/grd/research-phase.md +200 -0
  49. package/commands/grd/research.md +681 -0
  50. package/commands/grd/resume-work.md +40 -0
  51. package/commands/grd/set-profile.md +106 -0
  52. package/commands/grd/settings.md +136 -0
  53. package/commands/grd/update.md +172 -0
  54. package/commands/grd/verify-work.md +219 -0
  55. package/get-research-done/config/default.json +15 -0
  56. package/get-research-done/references/checkpoints.md +1078 -0
  57. package/get-research-done/references/continuation-format.md +249 -0
  58. package/get-research-done/references/git-integration.md +254 -0
  59. package/get-research-done/references/model-profiles.md +73 -0
  60. package/get-research-done/references/planning-config.md +94 -0
  61. package/get-research-done/references/questioning.md +141 -0
  62. package/get-research-done/references/tdd.md +263 -0
  63. package/get-research-done/references/ui-brand.md +160 -0
  64. package/get-research-done/references/verification-patterns.md +612 -0
  65. package/get-research-done/templates/DEBUG.md +159 -0
  66. package/get-research-done/templates/UAT.md +247 -0
  67. package/get-research-done/templates/archive-reason.md +195 -0
  68. package/get-research-done/templates/codebase/architecture.md +255 -0
  69. package/get-research-done/templates/codebase/concerns.md +310 -0
  70. package/get-research-done/templates/codebase/conventions.md +307 -0
  71. package/get-research-done/templates/codebase/integrations.md +280 -0
  72. package/get-research-done/templates/codebase/stack.md +186 -0
  73. package/get-research-done/templates/codebase/structure.md +285 -0
  74. package/get-research-done/templates/codebase/testing.md +480 -0
  75. package/get-research-done/templates/config.json +35 -0
  76. package/get-research-done/templates/context.md +283 -0
  77. package/get-research-done/templates/continue-here.md +78 -0
  78. package/get-research-done/templates/critic-log.md +288 -0
  79. package/get-research-done/templates/data-report.md +173 -0
  80. package/get-research-done/templates/debug-subagent-prompt.md +91 -0
  81. package/get-research-done/templates/decision-log.md +58 -0
  82. package/get-research-done/templates/decision.md +138 -0
  83. package/get-research-done/templates/discovery.md +146 -0
  84. package/get-research-done/templates/experiment-readme.md +104 -0
  85. package/get-research-done/templates/graduated-script.md +180 -0
  86. package/get-research-done/templates/iteration-summary.md +234 -0
  87. package/get-research-done/templates/milestone-archive.md +123 -0
  88. package/get-research-done/templates/milestone.md +115 -0
  89. package/get-research-done/templates/objective.md +271 -0
  90. package/get-research-done/templates/phase-prompt.md +567 -0
  91. package/get-research-done/templates/planner-subagent-prompt.md +117 -0
  92. package/get-research-done/templates/project.md +184 -0
  93. package/get-research-done/templates/requirements.md +231 -0
  94. package/get-research-done/templates/research-project/ARCHITECTURE.md +204 -0
  95. package/get-research-done/templates/research-project/FEATURES.md +147 -0
  96. package/get-research-done/templates/research-project/PITFALLS.md +200 -0
  97. package/get-research-done/templates/research-project/STACK.md +120 -0
  98. package/get-research-done/templates/research-project/SUMMARY.md +170 -0
  99. package/get-research-done/templates/research.md +529 -0
  100. package/get-research-done/templates/roadmap.md +202 -0
  101. package/get-research-done/templates/scorecard.json +113 -0
  102. package/get-research-done/templates/state.md +287 -0
  103. package/get-research-done/templates/summary.md +246 -0
  104. package/get-research-done/templates/user-setup.md +311 -0
  105. package/get-research-done/templates/verification-report.md +322 -0
  106. package/get-research-done/workflows/complete-milestone.md +756 -0
  107. package/get-research-done/workflows/diagnose-issues.md +231 -0
  108. package/get-research-done/workflows/discovery-phase.md +289 -0
  109. package/get-research-done/workflows/discuss-phase.md +433 -0
  110. package/get-research-done/workflows/execute-phase.md +657 -0
  111. package/get-research-done/workflows/execute-plan.md +1844 -0
  112. package/get-research-done/workflows/list-phase-assumptions.md +178 -0
  113. package/get-research-done/workflows/map-codebase.md +322 -0
  114. package/get-research-done/workflows/resume-project.md +307 -0
  115. package/get-research-done/workflows/transition.md +556 -0
  116. package/get-research-done/workflows/verify-phase.md +628 -0
  117. package/get-research-done/workflows/verify-work.md +596 -0
  118. package/hooks/dist/grd-check-update.js +61 -0
  119. package/hooks/dist/grd-statusline.js +84 -0
  120. package/package.json +47 -0
  121. package/scripts/audit-help-commands.sh +115 -0
  122. package/scripts/build-hooks.js +42 -0
  123. package/scripts/verify-all-commands.sh +246 -0
  124. package/scripts/verify-architect-warning.sh +35 -0
  125. package/scripts/verify-insights-mode.sh +40 -0
  126. package/scripts/verify-quick-mode.sh +20 -0
  127. package/scripts/verify-revise-data-routing.sh +139 -0
@@ -0,0 +1,1095 @@
1
+ # /grd:evaluate
2
+
3
+ **Human evaluation gate for validated experiments (Phase 5 command)**
4
+
5
+ ---
6
+ name: grd:evaluate
7
+ description: Human evaluation gate for validated experiments
8
+ allowed-tools:
9
+ - Read
10
+ - Write
11
+ - Bash
12
+ - AskUserQuestion
13
+ phase: 5
14
+ requires: [experiments/run_NNN/metrics/SCORECARD.json]
15
+ produces: [experiments/run_NNN/DECISION.md, human_eval/decision_log.md]
16
+ ---
17
+
18
+ <objective>
19
+
20
+ Present complete evidence packages to humans for final validation decisions on ML experiments.
21
+
22
+ This command launches Phase 5 of the recursive validation loop—after the Critic approves (PROCEED) and the Evaluator generates quantitative benchmarks (SCORECARD.json), this command presents the full evidence package to the human researcher for a final decision: Seal (validate hypothesis), Iterate (continue experimentation), or Archive (abandon hypothesis).
23
+
24
+ **Creates:**
25
+ - `experiments/run_NNN/DECISION.md` — per-run decision record with timestamp, rationale, and context
26
+ - `human_eval/decision_log.md` — central append-only log of all decisions
27
+
28
+ **Use cases:**
29
+ - After Evaluator completes: Review SCORECARD.json and make final validation decision
30
+ - Multiple runs: Compare evidence across iterations before deciding
31
+ - Hypothesis validation: Seal successful experiments for production/publication
32
+ - Negative results: Archive failed hypotheses with documentation for future reference
33
+
34
+ **After this command:** Based on decision:
35
+ - Seal: Experiment validated, ready for downstream use
36
+ - Iterate: Continue with /grd:research (potentially with new direction)
37
+ - Archive: Hypothesis preserved in experiments/archive/ with reason
38
+
39
+ </objective>
40
+
41
+ <execution_context>
42
+
43
+ @~/.claude/get-research-done/templates/decision.md
44
+ @~/.claude/get-research-done/templates/archive-reason.md
45
+
46
+ </execution_context>
47
+
48
+ <process>
49
+
50
+ ## Phase 1: Setup and Evidence Loading
51
+
52
+ **Check if project initialized:**
53
+
54
+ ```bash
55
+ [ ! -f .planning/PROJECT.md ] && echo "ERROR: Project not initialized. Run /grd:new-project first." && exit 1
56
+ ```
57
+
58
+ **Determine target run:**
59
+
60
+ If [run_name] argument provided:
61
+ - Use specified run directory
62
+ - Validate it exists in experiments/
63
+
64
+ If no argument:
65
+ - Scan experiments/ for runs with SCORECARD.json
66
+ - Find the latest by run directory name (reverse sort, so the highest run_NNN wins)
67
+ - Use latest as target
68
+
69
+ ```bash
70
+ # Find runs with SCORECARD.json
71
+ if [ -n "$RUN_NAME" ]; then
72
+ RUN_DIR="experiments/$RUN_NAME"
73
+ [ ! -d "$RUN_DIR" ] && echo "ERROR: Run not found: $RUN_DIR" && exit 1
74
+ else
75
+ # Find latest run with SCORECARD
76
+ RUN_DIR=$(find experiments -name "SCORECARD.json" -type f | xargs -I {} dirname {} | xargs dirname | sort -r | head -1)
77
+ [ -z "$RUN_DIR" ] && echo "ERROR: No runs with SCORECARD.json found in experiments/" && exit 1
78
+ fi
79
+ ```
80
+
81
+ **Check SCORECARD.json exists (hard gate):**
82
+
83
+ ```bash
84
+ [ ! -f "$RUN_DIR/metrics/SCORECARD.json" ] && echo "ERROR: SCORECARD.json not found. Run must complete Evaluator phase first." && exit 1
85
+ ```
86
+
87
+ This is a HARD GATE - cannot proceed without quantitative benchmarks from Evaluator.
88
+
89
+ **Load evidence package:**
90
+
91
+ 1. **SCORECARD.json** (required):
92
+ ```bash
93
+ cat "$RUN_DIR/metrics/SCORECARD.json"
94
+ ```
95
+ Extract:
96
+ - Verdict (PROCEED/FAIL)
97
+ - Composite score
98
+ - Individual metrics (name, value, threshold, status)
99
+ - Timestamp
100
+
101
+ 2. **OBJECTIVE.md** (required for context):
102
+ ```bash
103
+ cat .planning/OBJECTIVE.md
104
+ ```
105
+ Extract:
106
+ - Hypothesis statement (what/why/expected)
107
+ - Success metrics with weights and thresholds
108
+ - Falsification criteria
109
+
110
+ 3. **CRITIC_LOG.md** (required):
111
+ ```bash
112
+ cat "$RUN_DIR/CRITIC_LOG.md"
113
+ ```
114
+ Extract:
115
+ - Critic verdict (PROCEED)
116
+ - Confidence level (HIGH/MEDIUM/LOW)
117
+ - Strengths identified
118
+ - Weaknesses/concerns
119
+ - Recommendations
120
+
121
+ 4. **DATA_REPORT.md** (optional context):
122
+ ```bash
123
+ cat .planning/DATA_REPORT.md 2>/dev/null
124
+ ```
125
+ If exists, extract data characteristics for context.
126
+
127
+ 5. **Run metadata:**
128
+ ```bash
129
+ # Extract from directory structure and files
130
+ RUN_NAME=$(basename "$RUN_DIR")
131
+ ITERATION=$(grep "Iteration:" "$RUN_DIR/CRITIC_LOG.md" | head -1 | sed 's/.*: //')
132
+ TIMESTAMP=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M:%S" "$RUN_DIR")
133
+ DESCRIPTION=$(head -1 "$RUN_DIR/README.md" 2>/dev/null | sed 's/^# //')
134
+ ```
135
+
136
+ **Load iteration history:**
137
+
138
+ ```bash
139
+ # Count all runs in experiments/
140
+ TOTAL_RUNS=$(find experiments -maxdepth 1 -type d -name "run_*" | wc -l | tr -d ' ')
141
+
142
+ # Load verdict history from previous runs
143
+ for run_dir in experiments/run_*; do
144
+ if [ -f "$run_dir/CRITIC_LOG.md" ]; then
145
+ VERDICT=$(grep "^\*\*Verdict:\*\*" "$run_dir/CRITIC_LOG.md" | head -1 | sed 's/\*\*Verdict:\*\* //')
146
+ echo "$(basename $run_dir): $VERDICT"
147
+ fi
148
+ done > /tmp/verdict_history.txt
149
+ ```
150
+
151
+ ## Phase 2: Evidence Presentation
152
+
153
+ **Display evaluation banner:**
154
+ ```
155
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
156
+ GRD ► EVIDENCE PACKAGE: {run_name}
157
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
158
+
159
+ Run: {run_name}
160
+ Iteration: {N} of {total_runs}
161
+ ```
162
+
163
+ **Present executive summary first (always):**
164
+
165
+ Read and parse evidence files:
166
+ 1. SCORECARD.json → composite_score, overall_result, individual metrics
167
+ 2. CRITIC_LOG.md → verdict (PROCEED), confidence (HIGH/MEDIUM/LOW), strengths, weaknesses, recommendations
168
+ 3. OBJECTIVE.md → hypothesis statement (what section), success metrics with thresholds
169
+ 4. DATA_REPORT.md (optional) → sample size, class balance, leakage warnings
170
+
171
+ Determine verdict category:
172
+ - **VALIDATED**: Critic PROCEED + composite_score >= threshold + overall_result = PASS
173
+ - **FAILED**: composite_score < threshold OR overall_result = FAIL
174
+ - **INCONCLUSIVE**: Critic PROCEED with LOW confidence OR mixed metric results (some pass, some fail with borderline composite)
175
+
176
+ Display executive summary:
177
+
178
+ ```
179
+ ## Executive Summary
180
+
181
+ **Hypothesis:** {brief_what_statement_from_OBJECTIVE}
182
+
183
+ **Verdict:** {VALIDATED|FAILED|INCONCLUSIVE} ({confidence} confidence)
184
+ **Key Result:** {primary_metric}={value} (target: {comparison}{threshold}) {PASS|FAIL}
185
+ **Composite Score:** {score} (threshold: {threshold})
186
+
187
+ **Recommendation:** {one_sentence_from_critic_or_derived}
188
+ ```
189
+
190
+ **Determine drill-down depth adaptively:**
191
+
192
+ After executive summary, Claude decides which sections to present based on:
193
+ - **Complexity**: Multiple metrics → show full metrics table
194
+ - **Confidence**: LOW confidence → show critic reasoning in detail
195
+ - **Verdict clarity**: FAILED or INCONCLUSIVE → show more analysis
196
+ - **Data concerns**: If DATA_REPORT.md has warnings → show data characteristics
197
+ - **Multiple iterations**: If total_runs > 1 → show iteration timeline
198
+
199
+ **Potential drill-down sections (Claude selects relevant):**
200
+
201
+ ### Data Characteristics
202
+ Show if DATA_REPORT.md exists and has relevant warnings or context:
203
+ - **Sample size:** {N} samples, {M} features
204
+ - **Class balance:** {distribution if classification}
205
+ - **Leakage warnings:** {HIGH confidence warnings from DATA_REPORT.md}
206
+ - **Data quality:** {missing data percentage, outliers addressed}
207
+
208
+ ### Iteration Timeline
209
+ Show if multiple runs exist (total_runs > 1):
210
+ ```
211
+ Iteration history ({total_runs} runs):
212
+ - run_001_baseline: REVISE_METHOD (initial architecture)
213
+ - run_002_tuned: REVISE_METHOD (hyperparameter optimization)
214
+ - run_003_final: PROCEED (current) ✓
215
+ ```
216
+
217
+ Brief by default. User can request full history expansion.
218
+
219
+ ### Critic Reasoning
220
+ Show if verdict is borderline, confidence is LOW, or concerns exist:
221
+
222
+ **Strengths:**
223
+ - {strength_1 from CRITIC_LOG.md}
224
+ - {strength_2}
225
+
226
+ **Concerns:**
227
+ - {weakness_1 or "None identified"}
228
+
229
+ **Why PROCEED was given:**
230
+ {Extract reasoning from CRITIC_LOG.md}
231
+
232
+ **Residual concerns:**
233
+ {Any caveats or conditions mentioned}
234
+
235
+ ### Full Metrics Detail
236
+ Show if user needs detailed breakdown or multiple metrics with mixed results:
237
+
238
+ | Metric | Value | Threshold | Weight | Status |
239
+ |--------|-------|-----------|--------|--------|
240
+ | {metric_1} | {value} | {comparison}{threshold} | {weight} | {PASS|FAIL} |
241
+ | {metric_2} | {value} | {comparison}{threshold} | {weight} | {PASS|FAIL} |
242
+ ...
243
+
244
+ **Composite Score:** {weighted_average} = {calculation} (threshold: {overall_threshold})
245
+
246
+ ---
247
+
248
+ **Implementation notes:**
249
+ - DO NOT dump raw file contents
250
+ - Parse JSON/Markdown and extract relevant fields
251
+ - Present digestible summary with context
252
+ - Offer drill-down based on complexity and clarity
253
+ - Primary metric = highest weighted or first in metrics list
254
+
255
+ ## Phase 3: Decision Gate
256
+
257
+ **Present decision options:**
258
+
259
+ After evidence presentation, prompt user with three decision paths:
260
+
261
+ ```
262
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
263
+ GRD ► DECISION GATE
264
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
265
+
266
+ How would you like to proceed?
267
+
268
+ 1. **Seal** — Hypothesis validated, ready for production/publication
269
+ 2. **Iterate** — Continue experimentation (Critic suggests: {direction})
270
+ 3. **Archive** — Abandon hypothesis, preserve as negative result
271
+ ```
272
+
273
+ **Extract Critic recommendation before prompting:**
274
+
275
+ ```bash
276
+ # Parse CRITIC_LOG.md for last recommendation
277
+ CRITIC_RECOMMENDATION=$(grep -A 10 "## Recommendations" "$RUN_DIR/CRITIC_LOG.md" | tail -10)
278
+
279
+ # Determine suggested direction
280
+ if echo "$CRITIC_RECOMMENDATION" | grep -q "method\|algorithm\|architecture\|hyperparameter"; then
281
+ DIRECTION="REVISE_METHOD"
282
+ SUGGESTION="method refinement"
283
+ elif echo "$CRITIC_RECOMMENDATION" | grep -q "data\|feature\|sample\|leakage"; then
284
+ DIRECTION="REVISE_DATA"
285
+ SUGGESTION="data concerns"
286
+ else
287
+ DIRECTION="continue experimentation"
288
+ SUGGESTION="further iteration"
289
+ fi
290
+ ```
291
+
292
+ **Use AskUserQuestion for decision:**
293
+
294
+ ```javascript
295
+ AskUserQuestion({
296
+ header: "Experiment Decision: {run_name}",
297
+ question: "How would you like to proceed?",
298
+ options: [
299
+ "Seal — Hypothesis validated, ready for production/publication",
300
+ "Iterate — Continue experimentation (Critic suggests: {SUGGESTION})",
301
+ "Archive — Abandon hypothesis, preserve as negative result"
302
+ ]
303
+ })
304
+ ```
305
+
306
+ **Handle decision based on user selection:**
307
+
308
+ ### If user selects "Seal":
309
+ - No confirmation needed (affirmative action)
310
+ - Store decision: `DECISION="Seal"`
311
+ - Proceed directly to Phase 4 (Decision Logging)
312
+ - Display: "Hypothesis sealed. Decision logged to {run_dir}/DECISION.md"
313
+
314
+ ### If user selects "Iterate":
315
+ - No confirmation needed
316
+ - Store decision: `DECISION="Iterate"`
317
+ - Extract Critic's last recommendation from CRITIC_LOG.md
318
+ - Auto-suggest next steps based on direction:
319
+ - If REVISE_METHOD: "Continue with method refinement. Next: /grd:research --continue"
320
+ - If REVISE_DATA: "Data concerns identified. Next: /grd:explore with specific concerns, then /grd:research --continue"
321
+ - If generic: "Continue experimentation. Next: /grd:research --continue"
322
+ - Display recommendation with next command
323
+ - Proceed to Phase 4 (Decision Logging)
324
+
325
+ ### If user selects "Archive":
326
+ - **Confirmation gate required** (destructive action)
327
+
328
+ **Confirmation step:**
329
+
330
+ ```javascript
331
+ AskUserQuestion({
332
+ header: "Confirm Archive",
333
+ question: "This will archive all runs and mark the hypothesis as failed. Continue?",
334
+ options: [
335
+ "Yes, archive with rationale",
336
+ "Cancel"
337
+ ]
338
+ })
339
+ ```
340
+
341
+ If user selects "Cancel":
342
+ - Return to decision gate (re-prompt with Seal/Iterate/Archive options)
343
+ - Do not proceed to archival
344
+
345
+ If user selects "Yes, archive with rationale":
346
+ - **Require rationale** (mandatory for archive)
347
+
348
+ **Rationale capture:**
349
+
350
+ ```javascript
351
+ AskUserQuestion({
352
+ header: "Archive Rationale",
353
+ question: "Why is this hypothesis being abandoned? (Required - saved in ARCHIVE_REASON.md)",
354
+ // Free-form text input expected from user response
355
+ })
356
+ ```
357
+
358
+ **Validate rationale:**
359
+ - Check rationale is not empty or whitespace-only
360
+ - If empty, prompt again: "Rationale is required to preserve context for future researchers. Please provide reason for abandonment."
361
+ - Loop until valid rationale provided
362
+ - Store rationale in variable: `ARCHIVE_RATIONALE="{user_input}"`
363
+
364
+ Store decision and rationale:
365
+ ```bash
366
+ DECISION="Archive"
367
+ ARCHIVE_RATIONALE="{user_provided_text}"
368
+ ```
369
+
370
+ Proceed to Phase 4 (Decision Logging) and Phase 5 (Archive Handling)
371
+
372
+ ---
373
+
374
+ **Implementation notes:**
375
+ - Seal: Direct path to logging (no gates)
376
+ - Iterate: Direct path with auto-suggestion (no gates)
377
+ - Archive: Two-step confirmation (confirm → rationale) before proceeding
378
+ - All three paths eventually reach Phase 4 for DECISION.md creation
379
+
380
+ ## Phase 4: Decision Logging
381
+
382
+ After user decision captured, generate decision records and update project state.
383
+
384
+ **Step 1: Extract evidence data for logging**
385
+
386
+ ```bash
387
+ # Extract key data from evidence package for DECISION.md
388
+ TIMESTAMP_ISO=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
389
+ TIMESTAMP_LOCAL=$(date +"%Y-%m-%d %H:%M")
390
+ TIMESTAMP_DATE=$(date +"%Y-%m-%d")
391
+
392
+ # Extract hypothesis brief from OBJECTIVE.md (what section)
393
+ HYPOTHESIS_BRIEF=$(grep -A 5 "^### What" .planning/OBJECTIVE.md | tail -n +2 | head -5 | tr '\n' ' ' | tr -s ' ' | cut -c 1-100)
394
+
395
+ # Extract Critic data from CRITIC_LOG.md
396
+ CRITIC_CONFIDENCE=$(grep "^\*\*Confidence:\*\*" "$RUN_DIR/CRITIC_LOG.md" | head -1 | sed 's/.*: //')
397
+
398
+ # Extract Evaluator data from SCORECARD.json
399
+ COMPOSITE_SCORE=$(jq -r '.composite_score' "$RUN_DIR/metrics/SCORECARD.json")
400
+ THRESHOLD=$(jq -r '.threshold' "$RUN_DIR/metrics/SCORECARD.json")
401
+
402
+ # Get primary metric (highest weight or first)
403
+ KEY_METRIC_NAME=$(jq -r '.metrics | to_entries | max_by(.value.weight) | .key' "$RUN_DIR/metrics/SCORECARD.json")
404
+ KEY_METRIC_VALUE=$(jq -r --arg name "$KEY_METRIC_NAME" '.metrics[$name].value' "$RUN_DIR/metrics/SCORECARD.json")
405
+ KEY_METRIC_THRESHOLD=$(jq -r --arg name "$KEY_METRIC_NAME" '.metrics[$name].threshold' "$RUN_DIR/metrics/SCORECARD.json")
406
+ KEY_METRIC_COMPARISON=$(jq -r --arg name "$KEY_METRIC_NAME" '.metrics[$name].comparison' "$RUN_DIR/metrics/SCORECARD.json")
407
+ ```
408
+
409
+ **Step 2: Build metrics table for DECISION.md**
410
+
411
+ ```bash
412
+ # Extract all metrics from SCORECARD.json, sorted by weight descending
413
+ METRICS_TABLE=$(jq -r '.metrics | to_entries | sort_by(-.value.weight) | .[] |
414
+ "| \(.key) | \(.value.value) | \(.value.comparison)\(.value.threshold) | \(.value.status) |"' \
415
+ "$RUN_DIR/metrics/SCORECARD.json")
416
+ ```
417
+
418
+ **Step 3: Generate decision-specific context**
419
+
420
+ ```bash
421
+ # Populate decision context section based on type
422
+ if [ "$DECISION" = "Seal" ]; then
423
+ DECISION_CONTEXT="### For Seal
424
+ - Hypothesis validated
425
+ - Ready for production/publication
426
+ - All success criteria met"
427
+
428
+ elif [ "$DECISION" = "Iterate" ]; then
429
+ # Extract Critic's recommendation if available
430
+ CRITIC_RECOMMENDATION=$(grep -A 10 "^\*\*Recommendation:\*\*" "$RUN_DIR/CRITIC_LOG.md" | tail -n +2 | head -5 | tr '\n' ' ' | tr -s ' ')
431
+ DECISION_CONTEXT="### For Iterate
432
+ - Continuing experimentation
433
+ - Direction: Based on Critic recommendation
434
+ - Next focus: ${CRITIC_RECOMMENDATION:-Further refinement needed}"
435
+
436
+ elif [ "$DECISION" = "Archive" ]; then
437
+ DECISION_CONTEXT="### For Archive
438
+ - Hypothesis abandoned
439
+ - Reason: ${ARCHIVE_RATIONALE:-${RATIONALE:-Not provided}}
440
+ - Preserved as negative result"
441
+ fi
442
+ ```
443
+
444
+ **Step 4: Create per-run DECISION.md**
445
+
446
+ Use Write tool to create DECISION.md in run directory:
447
+
448
+ ```bash
449
+ # Generate DECISION.md from template
450
+ cat > "$RUN_DIR/DECISION.md" << EOF
451
+ # Human Decision: $RUN_NAME
452
+
453
+ **Timestamp:** $TIMESTAMP_ISO
454
+ **Hypothesis:** $HYPOTHESIS_BRIEF
455
+ **Decision:** $DECISION
456
+ **Rationale:** ${ARCHIVE_RATIONALE:-${RATIONALE:-Not provided}}
457
+
458
+ ## Evidence Summary
459
+
460
+ **Critic Verdict:** PROCEED (Confidence: $CRITIC_CONFIDENCE)
461
+ **Composite Score:** $COMPOSITE_SCORE (threshold: $THRESHOLD)
462
+ **Key Metric:** $KEY_METRIC_NAME=$KEY_METRIC_VALUE (target: $KEY_METRIC_COMPARISON$KEY_METRIC_THRESHOLD)
463
+
464
+ ## Metrics Detail
465
+
466
+ | Metric | Value | Threshold | Status |
467
+ |--------|-------|-----------|--------|
468
+ $METRICS_TABLE
469
+
470
+ ## Decision Context
471
+
472
+ $DECISION_CONTEXT
473
+
474
+ ---
475
+
476
+ *Decision recorded: $TIMESTAMP_ISO*
477
+ *Run directory: $RUN_DIR/*
478
+ EOF
479
+ ```
480
+
481
+ **Step 5: Create/append to central decision_log.md**
482
+
483
+ Use Write tool with append logic:
484
+
485
+ ```bash
486
+ # Ensure human_eval directory exists
487
+ mkdir -p human_eval
488
+
489
+ # Create log file if doesn't exist (use decision-log.md template header)
490
+ if [ ! -f human_eval/decision_log.md ]; then
491
+ cat > human_eval/decision_log.md << EOF
492
+ # Human Evaluation Decision Log
493
+
494
+ This log tracks all human evaluation decisions for this research project.
495
+
496
+ | Timestamp | Run | Decision | Key Metric | Reference |
497
+ |-----------|-----|----------|------------|-----------|
498
+ EOF
499
+ fi
500
+
501
+ # Append new entry (chronological, newest at bottom)
502
+ echo "| $TIMESTAMP_LOCAL | $RUN_NAME | $DECISION | $KEY_METRIC_NAME=$KEY_METRIC_VALUE | $RUN_DIR/ |" >> human_eval/decision_log.md
503
+ ```
504
+
505
+ **Step 6: Update STATE.md Research Loop State**
506
+
507
+ Update appropriate sections based on decision:
508
+
509
+ ```bash
510
+ # Update Research Loop State > Status based on decision
511
+ if [ "$DECISION" = "Seal" ]; then
512
+ # Mark hypothesis as validated
513
+ sed -i '' 's/^\*\*Status:\*\* .*/\*\*Status:\*\* validated/' .planning/STATE.md
514
+
515
+ elif [ "$DECISION" = "Iterate" ]; then
516
+ # Keep loop active, mark phase as researcher for next iteration
517
+ sed -i '' 's/^\*\*Phase:\*\* .*/\*\*Phase:\*\* researcher/' .planning/STATE.md
518
+
519
+ elif [ "$DECISION" = "Archive" ]; then
520
+ # Mark hypothesis as archived
521
+ sed -i '' 's/^\*\*Status:\*\* .*/\*\*Status:\*\* archived/' .planning/STATE.md
522
+ fi
523
+
524
+ # Add entry to Human Decisions table
525
+ RATIONALE_EXCERPT=$(echo "${ARCHIVE_RATIONALE:-${RATIONALE:-Not provided}}" | head -c 50)
526
+
527
+ # Find Human Decisions table and append entry
528
+ # Using awk to find table and insert after header row
529
+ awk -v date="$TIMESTAMP_DATE" -v dec="$DECISION" -v rat="$RATIONALE_EXCERPT" '
530
+ /^### Human Decisions/ { in_section=1 }
531
+ in_section && /^\| Timestamp \| Decision \| Rationale \|$/ {
532
+ print
533
+ print "| " date " | " dec " | " rat " |"
534
+ in_section=0
535
+ next
536
+ }
537
+ { print }
538
+ ' .planning/STATE.md > .planning/STATE.md.tmp && mv .planning/STATE.md.tmp .planning/STATE.md
539
+
540
+ # Update Loop History table with final decision annotation
541
+ # Mark current run with decision in parentheses
542
+ sed -i '' "s#^\(.*| $RUN_NAME |.*\)#\1 (Human: $DECISION)#" .planning/STATE.md
543
+ ```
544
+
545
+ **Step 7: Display confirmation**
546
+
547
+ ```
548
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
549
+ GRD ► DECISION LOGGED
550
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
551
+
552
+ Decision: {Seal|Iterate|Archive}
553
+
554
+ Logged to:
555
+ - experiments/{run_name}/DECISION.md
556
+ - human_eval/decision_log.md
557
+ - .planning/STATE.md (Research Loop State updated)
558
+
559
+ {Next steps based on decision:
560
+ - Seal: "Experiment validated. Review run directory for artifacts."
561
+ - Iterate: "Ready for next iteration. Run /grd:research --continue"
562
+ - Archive: "Proceeding with archive workflow..." (triggers Phase 5)}
563
+ ```
564
+
565
+ ## Phase 5: Archive Handling (if Archive decision)
566
+
567
+ **If decision is Archive:**
568
+
569
+ **Step 1: Determine archive path**
570
+
571
+ ```bash
572
+ # Extract hypothesis name from OBJECTIVE.md (sanitize for filesystem)
573
+ HYPOTHESIS_RAW=$(grep -A 1 "## Hypothesis" .planning/OBJECTIVE.md | tail -1 | head -c 50)
574
+ HYPOTHESIS_NAME=$(echo "$HYPOTHESIS_RAW" | tr ' ' '_' | tr -cd '[:alnum:]_-' | tr '[:upper:]' '[:lower:]')
575
+
576
+ # Date prefix
577
+ DATE_PREFIX=$(date +%Y-%m-%d)
578
+
579
+ # Archive directory path
580
+ ARCHIVE_DIR="experiments/archive/${DATE_PREFIX}_${HYPOTHESIS_NAME}"
581
+
582
+ # Create archive directory
583
+ mkdir -p "$ARCHIVE_DIR"
584
+
585
+ echo "Creating archive directory: $ARCHIVE_DIR"
586
+ ```
587
+
588
+ **Step 2: Identify final and intermediate runs**
589
+
590
+ ```bash
591
+ # Find all run directories
592
+ ALL_RUNS=$(ls -d experiments/run_* 2>/dev/null | sort)
593
+
594
+ # Final run is the current one being evaluated (with DECISION.md)
595
+ FINAL_RUN="$RUN_DIR"
596
+
597
+ # Intermediate runs are all others
598
+ INTERMEDIATE_RUNS=$(echo "$ALL_RUNS" | grep -v "$(basename $FINAL_RUN)")
599
+
600
+ # Count intermediate runs
601
+ INTERMEDIATE_COUNT=$(echo "$INTERMEDIATE_RUNS" | grep -c "run_" || true)
602
+ TOTAL_ITERATIONS=$(echo "$ALL_RUNS" | grep -c "run_")
603
+
604
+ echo "Identified:"
605
+ echo " Final run: $FINAL_RUN"
606
+ echo " Intermediate runs: $INTERMEDIATE_COUNT"
607
+ echo " Total iterations: $TOTAL_ITERATIONS"
608
+ ```
609
+
610
+ **Step 3: Move final run to archive**
611
+
612
+ Display progress:
613
+ ```
614
+ Moving final run to archive...
615
+ From: $FINAL_RUN
616
+ To: $ARCHIVE_DIR/run_final/
617
+ ```
618
+
619
+ ```bash
620
+ # Move final run directory to archive (preserves full structure)
621
+ mv "$FINAL_RUN" "$ARCHIVE_DIR/run_final"
622
+
623
+ # Verify move succeeded
624
+ if [ ! -d "$ARCHIVE_DIR/run_final" ]; then
625
+ echo "ERROR: Failed to move final run to archive"
626
+ exit 1
627
+ fi
628
+
629
+ echo "✓ Final run moved to archive"
630
+ ```
631
+
632
+ **Step 4: Generate ARCHIVE_REASON.md**
633
+
634
+ Extract data for ARCHIVE_REASON.md template:
635
+
636
+ ```bash
637
+ # Extract hypothesis statement from OBJECTIVE.md
638
+ HYPOTHESIS_STATEMENT=$(grep -A 20 "^## Hypothesis" .planning/OBJECTIVE.md | grep -A 15 "^### What" | tail -n +2 | sed '/^###/,$d' | tr '\n' ' ' | tr -s ' ')
639
+
640
+ # Extract final iteration and verdict
641
+ ITERATION_COUNT=$TOTAL_ITERATIONS
642
+ ITERATION_LIMIT=${ITERATION_LIMIT:-5} # Default 5 if not set
643
+ FINAL_VERDICT=$(grep "^\*\*Verdict:\*\*" "$ARCHIVE_DIR/run_final/CRITIC_LOG.md" | head -1 | sed 's/\*\*Verdict:\*\* //')
644
+
645
+ # Extract best metrics across all runs
646
+ BEST_METRICS_TABLE=""
647
+ # Get metric names from OBJECTIVE.md
648
+ METRIC_NAMES=$(grep -A 50 "^## Success Metrics" .planning/OBJECTIVE.md | grep "^- \*\*" | sed 's/^- \*\*\(.*\)\*\*:.*/\1/')
649
+
650
+ # For each metric, find best value across all iterations
651
+ for metric in $METRIC_NAMES; do
652
+ # Find best value from all SCORECARD.json files
653
+ BEST_VALUE=$(jq -r --arg m "$metric" '.metrics[$m].value // empty' experiments/run_*/metrics/SCORECARD.json "$ARCHIVE_DIR"/run_final/metrics/SCORECARD.json 2>/dev/null | sort -rn | head -1) # scan this hypothesis's remaining runs plus the archived final run, not other archives
654
+ TARGET=$(jq -r --arg m "$metric" '.metrics[$m].threshold // empty' "$ARCHIVE_DIR/run_final/metrics/SCORECARD.json" 2>/dev/null | head -1)
655
+ COMPARISON=$(jq -r --arg m "$metric" '.metrics[$m].comparison // empty' "$ARCHIVE_DIR/run_final/metrics/SCORECARD.json" 2>/dev/null | head -1)
656
+
657
+ if [ -n "$BEST_VALUE" ] && [ -n "$TARGET" ]; then
658
+ GAP=$(echo "$BEST_VALUE - $TARGET" | bc 2>/dev/null || echo "N/A")
659
+ BEST_METRICS_TABLE="$BEST_METRICS_TABLE
660
+ | $metric | $BEST_VALUE | $COMPARISON$TARGET | $GAP |"
661
+ fi
662
+ done
663
+
664
+ # Get user rationale (already captured in Phase 3)
665
+ # ARCHIVE_RATIONALE variable should be set from Archive confirmation step
666
+
667
+ # Create ARCHIVE_REASON.md from template
668
+ cat > "$ARCHIVE_DIR/ARCHIVE_REASON.md" << EOF
669
+ # Archive Reason: $HYPOTHESIS_NAME
670
+
671
+ **Archived:** $(date -u +"%Y-%m-%dT%H:%M:%SZ")
672
+ **Original Hypothesis:** $HYPOTHESIS_STATEMENT
673
+ **Final Iteration:** $ITERATION_COUNT of $ITERATION_LIMIT
674
+ **Final Verdict:** $FINAL_VERDICT
675
+
676
+ ## Why This Failed
677
+
678
+ $ARCHIVE_RATIONALE
679
+
680
+ ## What We Learned
681
+
682
+ *To be filled: Insights from failed attempts*
683
+
684
+ - Key finding 1
685
+ - Key finding 2
686
+ - Key finding 3
687
+
688
+ ## What Would Need to Change
689
+
690
+ *To be filled: Conditions under which this might work*
691
+
692
+ - Required change 1
693
+ - Required change 2
694
+ - Required change 3
695
+
696
+ ## Final Metrics
697
+
698
+ | Metric | Best Value | Target | Gap |
699
+ |--------|------------|--------|-----|
700
+ $BEST_METRICS_TABLE
701
+
702
+ ## Iteration Timeline
703
+
704
+ See: ITERATION_SUMMARY.md for detailed history of all attempts.
705
+
706
+ ---
707
+
708
+ *This negative result is preserved to prevent future researchers from repeating this approach without the necessary conditions.*
709
+
710
+ ---
711
+
712
+ **Archive location:** experiments/archive/${DATE_PREFIX}_${HYPOTHESIS_NAME}/
713
+ **Decision recorded:** human_eval/decision_log.md
714
+ EOF
715
+
716
+ echo "✓ ARCHIVE_REASON.md created"
717
+ ```
718
+
719
+ **Step 5: Generate ITERATION_SUMMARY.md**
720
+
721
+ Scan all run directories and compile iteration history:
722
+
723
+ ```bash
724
+ # Build iteration history table from all runs
725
+ HISTORY_TABLE=""
726
+ ITERATION_NUM=1
727
+
728
+ # Collect verdict distribution counters
729
+ PROCEED_COUNT=0
730
+ REVISE_METHOD_COUNT=0
731
+ REVISE_DATA_COUNT=0
732
+ ESCALATE_COUNT=0
733
+
734
+ # Collect date range
735
+ FIRST_DATE=""
736
+ LAST_DATE=""
737
+
738
+ for run_dir in $ALL_RUNS; do
739
+ run_name=$(basename "$run_dir")
740
+
741
+ # Check if run has been archived (is now in archive/*/run_final/)
742
+ if [ "$run_dir" = "$ARCHIVE_DIR/run_final" ] || [ ! -d "$run_dir" ]; then
743
+ # This is the archived final run
744
+ run_dir="$ARCHIVE_DIR/run_final"
745
+ fi
746
+
747
+ # Extract verdict and confidence from CRITIC_LOG.md
748
+ if [ -f "$run_dir/CRITIC_LOG.md" ]; then
749
+ verdict=$(grep "^\*\*Verdict:\*\*" "$run_dir/CRITIC_LOG.md" | head -1 | sed 's/\*\*Verdict:\*\* //')
750
+ confidence=$(grep "^\*\*Confidence:\*\*" "$run_dir/CRITIC_LOG.md" | head -1 | sed 's/\*\*Confidence:\*\* //')
751
+
752
+ # Count verdicts
753
+ case "$verdict" in
754
+ PROCEED) PROCEED_COUNT=$((PROCEED_COUNT + 1)) ;;
755
+ REVISE_METHOD) REVISE_METHOD_COUNT=$((REVISE_METHOD_COUNT + 1)) ;;
756
+ REVISE_DATA) REVISE_DATA_COUNT=$((REVISE_DATA_COUNT + 1)) ;;
757
+ ESCALATE) ESCALATE_COUNT=$((ESCALATE_COUNT + 1)) ;;
758
+ esac
759
+ else
760
+ verdict="N/A"
761
+ confidence="N/A"
762
+ fi
763
+
764
+ # Extract key metric from SCORECARD.json (primary metric = highest weight)
765
+ if [ -f "$run_dir/metrics/SCORECARD.json" ]; then
766
+ key_metric_name=$(jq -r '.metrics | to_entries | max_by(.value.weight // 0) | .key // "N/A"' "$run_dir/metrics/SCORECARD.json" 2>/dev/null)
767
+ key_metric_value=$(jq -r --arg m "$key_metric_name" '.metrics[$m].value // "N/A"' "$run_dir/metrics/SCORECARD.json" 2>/dev/null)
768
+ key_metric="$key_metric_name=$key_metric_value"
769
+ else
770
+ key_metric="N/A"
771
+ fi
772
+
773
+ # Get run date
774
+ run_date=$(date -r "$run_dir" +%Y-%m-%d 2>/dev/null || stat -f "%Sm" -t "%Y-%m-%d" "$run_dir" 2>/dev/null || date +%Y-%m-%d) # date -r FILE works on GNU and recent BSD; stat -f is the older BSD/macOS fallback
775
+
776
+ # Track date range
777
+ if [ -z "$FIRST_DATE" ]; then
778
+ FIRST_DATE=$run_date
779
+ fi
780
+ LAST_DATE=$run_date
781
+
782
+ # Extract notes from README.md
783
+ notes=""
784
+ if [ -f "$run_dir/README.md" ]; then
785
+ notes=$(head -1 "$run_dir/README.md" | sed 's/^# //' | cut -c 1-30)
786
+ fi
787
+
788
+ # Append to history table
789
+ HISTORY_TABLE="$HISTORY_TABLE
790
+ | $ITERATION_NUM | $run_name | $run_date | $verdict | $confidence | $key_metric | $notes |"
791
+
792
+ ITERATION_NUM=$((ITERATION_NUM + 1))
793
+ done
794
+
795
+ # Calculate metric trend
796
+ # Extract primary metric values across all iterations (the final run was moved in Step 3, so a missing directory is remapped to the archive copy)
797
+ METRIC_VALUES=$(for run in $ALL_RUNS; do [ -d "$run" ] || run="$ARCHIVE_DIR/run_final"
798
+ if [ -f "$run/metrics/SCORECARD.json" ]; then
799
+ jq -r '.metrics | to_entries | max_by(.value.weight // 0) | .value.value // empty' "$run/metrics/SCORECARD.json" 2>/dev/null
800
+ fi
801
+ done)
802
+
803
+ # Determine best metric
804
+ BEST_METRIC=$(echo "$METRIC_VALUES" | sort -rn | head -1)
805
+
806
+ # Get target from OBJECTIVE.md or final SCORECARD
807
+ TARGET_THRESHOLD=$(jq -r '.metrics | to_entries | max_by(.value.weight // 0) | .value.threshold // "N/A"' "$ARCHIVE_DIR/run_final/metrics/SCORECARD.json" 2>/dev/null)
808
+
809
+ # Calculate gap
810
+ if [ -n "$BEST_METRIC" ] && [ -n "$TARGET_THRESHOLD" ] && [ "$TARGET_THRESHOLD" != "N/A" ]; then
811
+ GAP=$(echo "$BEST_METRIC - $TARGET_THRESHOLD" | bc 2>/dev/null || echo "N/A")
812
+ else
813
+ GAP="N/A"
814
+ fi
815
+
816
+ # Determine trend (simple heuristic: compare first and last values)
817
+ FIRST_VALUE=$(echo "$METRIC_VALUES" | head -1)
818
+ LAST_VALUE=$(echo "$METRIC_VALUES" | tail -1)
819
+
820
+ if [ -n "$FIRST_VALUE" ] && [ -n "$LAST_VALUE" ]; then
821
+ TREND_DIFF=$(echo "$LAST_VALUE - $FIRST_VALUE" | bc 2>/dev/null || echo 0)
822
+ if (( $(echo "$TREND_DIFF > 0.05" | bc -l 2>/dev/null) )); then
823
+ TREND="improving"
824
+ elif (( $(echo "$TREND_DIFF < -0.05" | bc -l 2>/dev/null) )); then
825
+ TREND="degrading"
826
+ else
827
+ TREND="stagnant"
828
+ fi
829
+ else
830
+ TREND="N/A"
831
+ fi
832
+
833
+ # Write ITERATION_SUMMARY.md
834
+ cat > "$ARCHIVE_DIR/ITERATION_SUMMARY.md" << EOF
835
+ # Iteration Summary: $HYPOTHESIS_NAME
836
+
837
+ **Total Iterations:** $TOTAL_ITERATIONS
838
+ **Date Range:** $FIRST_DATE to $LAST_DATE
839
+ **Outcome:** Archived (hypothesis abandoned)
840
+
841
+ ## Iteration History
842
+
843
+ | # | Run | Date | Verdict | Confidence | Key Metric | Notes |
844
+ |---|-----|------|---------|------------|------------|-------|
845
+ $HISTORY_TABLE
846
+
847
+ ## Metric Trend
848
+
849
+ **Best achieved:** $BEST_METRIC
850
+ **Target:** $TARGET_THRESHOLD
851
+ **Gap:** $GAP
852
+ **Trend:** $TREND
853
+
854
+ ## Verdict Distribution
855
+
856
+ - PROCEED: $PROCEED_COUNT
857
+ - REVISE_METHOD: $REVISE_METHOD_COUNT
858
+ - REVISE_DATA: $REVISE_DATA_COUNT
859
+ - ESCALATE: $ESCALATE_COUNT
860
+
861
+ ## Key Observations
862
+
863
+ *Summary of what was tried and why it didn't work*
864
+
865
+ ## Preserved Artifacts
866
+
867
+ - Final run: run_final/
868
+ - All CRITIC_LOG.md files preserved in final run
869
+ - Final SCORECARD.json preserved in final run
870
+
871
+ ---
872
+
873
+ *Summary generated on archive. See ARCHIVE_REASON.md for human rationale.*
874
+ EOF
875
+
876
+ echo "✓ ITERATION_SUMMARY.md created"
877
+ ```
878
+
879
+ **Step 6: Create metadata.json**
880
+
881
+ Create archival metadata for programmatic access:
882
+
883
+ ```bash
884
+ # Extract first line of rationale for summary
885
+ RATIONALE_FIRST_LINE=$(echo "$ARCHIVE_RATIONALE" | head -1 | cut -c 1-100)
886
+
887
+ # Create archival metadata JSON (built with jq so quotes/backslashes in free-text fields are escaped; non-numeric counts fall back to 0)
888
+ jq -n \
889
+ --arg archived_at "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
890
+ --arg hypothesis "$HYPOTHESIS_STATEMENT" \
891
+ --arg total_iterations "$TOTAL_ITERATIONS" \
892
+ --arg final_verdict "$FINAL_VERDICT" \
893
+ --arg best_metric "$BEST_METRIC" \
894
+ --arg target_threshold "$TARGET_THRESHOLD" \
895
+ --arg archive_reason "$RATIONALE_FIRST_LINE" \
896
+ --arg intermediate_runs_removed "$INTERMEDIATE_COUNT" \
897
+ '{archived_at: $archived_at, hypothesis: $hypothesis, total_iterations: ($total_iterations | tonumber? // 0),
898
+ final_verdict: $final_verdict, best_metric: $best_metric, target_threshold: $target_threshold, archive_reason: $archive_reason,
899
+ intermediate_runs_removed: ($intermediate_runs_removed | tonumber? // 0), final_run_preserved: "run_final"}' \
900
+ > "$ARCHIVE_DIR/metadata.json"
901
+
902
+ echo "✓ metadata.json created"
903
+ ```
904
+
905
+ **Step 7: Clean up intermediate runs**
906
+
907
+ Display progress:
908
+ ```
909
+ Cleaning up intermediate runs...
910
+ ```
911
+
912
+ ```bash
913
+ # Remove intermediate run directories (final run already moved to archive)
914
+ REMOVED_COUNT=0
915
+ for run_dir in $INTERMEDIATE_RUNS; do
916
+ if [ -d "$run_dir" ] && [ -n "$run_dir" ]; then
917
+ echo " Removed: $run_dir"
918
+ rm -rf "$run_dir"
919
+ REMOVED_COUNT=$((REMOVED_COUNT + 1))
920
+ fi
921
+ done
922
+
923
+ if [ $REMOVED_COUNT -eq 0 ]; then
924
+ echo " No intermediate runs to remove"
925
+ else
926
+ echo " Removed $REMOVED_COUNT intermediate run(s)"
927
+ fi
928
+
929
+ echo " Kept: Final run preserved in archive"
930
+ ```
931
+
932
+ **Step 8: Update decision_log.md reference**
933
+
934
+ Update the decision log entry to point to archive location:
935
+
936
+ ```bash
937
+ # The decision log entry was created in Phase 4 pointing to experiments/run_NNN/
938
+ # Update it to point to archive location instead
939
+ RUN_NAME=$(basename "$FINAL_RUN")
940
+
941
+ # Update last entry in decision_log.md
942
+ # Change: experiments/run_NNN/ -> experiments/archive/YYYY-MM-DD_hypothesis/
943
+ if [ -f "human_eval/decision_log.md" ]; then
944
+ # Use sed to rewrite every occurrence of the run path (the g flag replaces all matches, not only the last entry)
945
+ sed -i.bak "s|experiments/$RUN_NAME/|$ARCHIVE_DIR/|g" human_eval/decision_log.md
946
+ rm -f human_eval/decision_log.md.bak
947
+
948
+ echo "✓ decision_log.md updated to reference archive location"
949
+ fi
950
+ ```
951
+
952
+ **Step 9: Display archive completion**
953
+
954
+ ```
955
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
956
+ GRD ► HYPOTHESIS ARCHIVED
957
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
958
+
959
+ **Archive Location:** $ARCHIVE_DIR
960
+
961
+ **Contents:**
962
+ - ARCHIVE_REASON.md — Your rationale preserved
963
+ - ITERATION_SUMMARY.md — All $TOTAL_ITERATIONS attempts summarized
964
+ - run_final/ — Final run with all artifacts
965
+ - metadata.json — Archival metadata
966
+
967
+ **Cleanup:**
968
+ - $INTERMEDIATE_COUNT intermediate run(s) removed
969
+ - Final run preserved with full artifacts
970
+
971
+ **Next steps:**
972
+ - Review ARCHIVE_REASON.md to add learnings
973
+ - Optionally fill "What We Learned" and "What Would Need to Change" sections
974
+ - Start new hypothesis with /grd:architect if ready
975
+
976
+ This negative result is preserved for future reference.
977
+ ```
978
+
979
+ ## Phase 6: Completion Summary
980
+
981
+ **Display final summary:**
982
+
983
+ **If Seal:**
984
+ ```
985
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
986
+ GRD ► HYPOTHESIS VALIDATED ✓
987
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
988
+
989
+ **Run:** {run_name}
990
+ **Decision:** Seal
991
+ **Recorded:** experiments/{run_name}/DECISION.md
992
+
993
+ This experiment is validated and ready for production/publication.
994
+
995
+ **Next steps:**
996
+ - Review final artifacts in run directory
997
+ - Consider downstream actions (deployment, paper submission, etc.)
998
+ ```
999
+
1000
+ **If Iterate:**
1001
+ ```
1002
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1003
+ GRD ► CONTINUING EXPERIMENTATION
1004
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1005
+
1006
+ **Run:** {run_name}
1007
+ **Decision:** Iterate
1008
+ **Recorded:** experiments/{run_name}/DECISION.md
1009
+
1010
+ Continuing experimentation based on: {direction_or_recommendation}
1011
+
1012
+ **Next steps:**
1013
+ - Run: /grd:research --continue
1014
+ {if_specific_direction: with focus on {direction}}
1015
+ ```
1016
+
1017
+ **If Archive:**
1018
+ ```
1019
+ [Archive confirmation message already shown in Phase 5]
1020
+ ```
1021
+
1022
+ **Update decision log reference:**
1023
+ ```
1024
+ **Decision log:** human_eval/decision_log.md
1025
+ ```
1026
+
1027
+ </process>
1028
+
1029
+ <arguments>
1030
+
1031
+ **[run_name]** (optional)
1032
+ - Name of specific run to evaluate
1033
+ - Examples: "run_003_tuned", "run_001_baseline"
1034
+ - If omitted, evaluates latest run with SCORECARD.json
1035
+
1036
+ </arguments>
1037
+
1038
+ <examples>
1039
+
1040
+ **Evaluate latest run:**
1041
+ ```
1042
+ /grd:evaluate
1043
+ # Finds latest run with SCORECARD.json
1044
+ ```
1045
+
1046
+ **Evaluate specific run:**
1047
+ ```
1048
+ /grd:evaluate run_003_tuned
1049
+ # Evaluates specified run
1050
+ ```
1051
+
1052
+ **After Evaluator completes:**
1053
+ ```
1054
+ # Researcher → Critic → Evaluator → /grd:evaluate
1055
+ /grd:evaluate
1056
+ # Human reviews evidence and makes decision
1057
+ ```
1058
+
1059
+ </examples>
1060
+
1061
+ <output>
1062
+
1063
+ - `experiments/run_NNN/DECISION.md` — per-run decision record with:
1064
+ - Timestamp (ISO 8601)
1065
+ - Hypothesis statement
1066
+ - Decision (Seal/Iterate/Archive)
1067
+ - Rationale (if provided)
1068
+ - Evidence summary (verdict, scores, metrics)
1069
+ - Decision context (strengths, concerns, next steps)
1070
+
1071
+ - `human_eval/decision_log.md` — central append-only log with:
1072
+ - Chronological table of all decisions
1073
+ - Date, run name, decision type, run directory path
1074
+
1075
+ - `experiments/archive/YYYY-MM-DD_hypothesis_name/` (if Archive) with:
1076
+ - `run_final/` — final experiment directory with DECISION.md
1077
+ - `ARCHIVE_REASON.md` — required rationale and learnings
1078
+ - `ITERATION_SUMMARY.md` — collapsed iteration history
1079
+
1080
+ </output>
1081
+
1082
+ <success_criteria>
1083
+
1084
+ - [ ] SCORECARD.json hard gate enforced (required)
1085
+ - [ ] Evidence package assembled (SCORECARD, OBJECTIVE, CRITIC_LOG, metadata)
1086
+ - [ ] Executive summary presented with adaptive drill-down
1087
+ - [ ] Human decision captured (Seal/Iterate/Archive)
1088
+ - [ ] Confirmation required for Archive with rationale
1089
+ - [ ] DECISION.md created in run directory
1090
+ - [ ] Central decision_log.md updated
1091
+ - [ ] STATE.md updated with decision
1092
+ - [ ] Archive process executed if Archive selected
1093
+ - [ ] ARCHIVE_REASON.md and ITERATION_SUMMARY.md created for archives
1094
+
1095
+ </success_criteria>