get-research-done 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +560 -0
- package/agents/grd-architect.md +789 -0
- package/agents/grd-codebase-mapper.md +738 -0
- package/agents/grd-critic.md +1065 -0
- package/agents/grd-debugger.md +1203 -0
- package/agents/grd-evaluator.md +948 -0
- package/agents/grd-executor.md +784 -0
- package/agents/grd-explorer.md +2063 -0
- package/agents/grd-graduator.md +484 -0
- package/agents/grd-integration-checker.md +423 -0
- package/agents/grd-phase-researcher.md +641 -0
- package/agents/grd-plan-checker.md +745 -0
- package/agents/grd-planner.md +1386 -0
- package/agents/grd-project-researcher.md +865 -0
- package/agents/grd-research-synthesizer.md +256 -0
- package/agents/grd-researcher.md +2361 -0
- package/agents/grd-roadmapper.md +605 -0
- package/agents/grd-verifier.md +778 -0
- package/bin/install.js +1294 -0
- package/commands/grd/add-phase.md +207 -0
- package/commands/grd/add-todo.md +193 -0
- package/commands/grd/architect.md +283 -0
- package/commands/grd/audit-milestone.md +277 -0
- package/commands/grd/check-todos.md +228 -0
- package/commands/grd/complete-milestone.md +136 -0
- package/commands/grd/debug.md +169 -0
- package/commands/grd/discuss-phase.md +86 -0
- package/commands/grd/evaluate.md +1095 -0
- package/commands/grd/execute-phase.md +339 -0
- package/commands/grd/explore.md +258 -0
- package/commands/grd/graduate.md +323 -0
- package/commands/grd/help.md +482 -0
- package/commands/grd/insert-phase.md +227 -0
- package/commands/grd/insights.md +231 -0
- package/commands/grd/join-discord.md +18 -0
- package/commands/grd/list-phase-assumptions.md +50 -0
- package/commands/grd/map-codebase.md +71 -0
- package/commands/grd/new-milestone.md +721 -0
- package/commands/grd/new-project.md +1008 -0
- package/commands/grd/pause-work.md +134 -0
- package/commands/grd/plan-milestone-gaps.md +295 -0
- package/commands/grd/plan-phase.md +525 -0
- package/commands/grd/progress.md +364 -0
- package/commands/grd/quick-explore.md +236 -0
- package/commands/grd/quick.md +309 -0
- package/commands/grd/remove-phase.md +349 -0
- package/commands/grd/research-phase.md +200 -0
- package/commands/grd/research.md +681 -0
- package/commands/grd/resume-work.md +40 -0
- package/commands/grd/set-profile.md +106 -0
- package/commands/grd/settings.md +136 -0
- package/commands/grd/update.md +172 -0
- package/commands/grd/verify-work.md +219 -0
- package/get-research-done/config/default.json +15 -0
- package/get-research-done/references/checkpoints.md +1078 -0
- package/get-research-done/references/continuation-format.md +249 -0
- package/get-research-done/references/git-integration.md +254 -0
- package/get-research-done/references/model-profiles.md +73 -0
- package/get-research-done/references/planning-config.md +94 -0
- package/get-research-done/references/questioning.md +141 -0
- package/get-research-done/references/tdd.md +263 -0
- package/get-research-done/references/ui-brand.md +160 -0
- package/get-research-done/references/verification-patterns.md +612 -0
- package/get-research-done/templates/DEBUG.md +159 -0
- package/get-research-done/templates/UAT.md +247 -0
- package/get-research-done/templates/archive-reason.md +195 -0
- package/get-research-done/templates/codebase/architecture.md +255 -0
- package/get-research-done/templates/codebase/concerns.md +310 -0
- package/get-research-done/templates/codebase/conventions.md +307 -0
- package/get-research-done/templates/codebase/integrations.md +280 -0
- package/get-research-done/templates/codebase/stack.md +186 -0
- package/get-research-done/templates/codebase/structure.md +285 -0
- package/get-research-done/templates/codebase/testing.md +480 -0
- package/get-research-done/templates/config.json +35 -0
- package/get-research-done/templates/context.md +283 -0
- package/get-research-done/templates/continue-here.md +78 -0
- package/get-research-done/templates/critic-log.md +288 -0
- package/get-research-done/templates/data-report.md +173 -0
- package/get-research-done/templates/debug-subagent-prompt.md +91 -0
- package/get-research-done/templates/decision-log.md +58 -0
- package/get-research-done/templates/decision.md +138 -0
- package/get-research-done/templates/discovery.md +146 -0
- package/get-research-done/templates/experiment-readme.md +104 -0
- package/get-research-done/templates/graduated-script.md +180 -0
- package/get-research-done/templates/iteration-summary.md +234 -0
- package/get-research-done/templates/milestone-archive.md +123 -0
- package/get-research-done/templates/milestone.md +115 -0
- package/get-research-done/templates/objective.md +271 -0
- package/get-research-done/templates/phase-prompt.md +567 -0
- package/get-research-done/templates/planner-subagent-prompt.md +117 -0
- package/get-research-done/templates/project.md +184 -0
- package/get-research-done/templates/requirements.md +231 -0
- package/get-research-done/templates/research-project/ARCHITECTURE.md +204 -0
- package/get-research-done/templates/research-project/FEATURES.md +147 -0
- package/get-research-done/templates/research-project/PITFALLS.md +200 -0
- package/get-research-done/templates/research-project/STACK.md +120 -0
- package/get-research-done/templates/research-project/SUMMARY.md +170 -0
- package/get-research-done/templates/research.md +529 -0
- package/get-research-done/templates/roadmap.md +202 -0
- package/get-research-done/templates/scorecard.json +113 -0
- package/get-research-done/templates/state.md +287 -0
- package/get-research-done/templates/summary.md +246 -0
- package/get-research-done/templates/user-setup.md +311 -0
- package/get-research-done/templates/verification-report.md +322 -0
- package/get-research-done/workflows/complete-milestone.md +756 -0
- package/get-research-done/workflows/diagnose-issues.md +231 -0
- package/get-research-done/workflows/discovery-phase.md +289 -0
- package/get-research-done/workflows/discuss-phase.md +433 -0
- package/get-research-done/workflows/execute-phase.md +657 -0
- package/get-research-done/workflows/execute-plan.md +1844 -0
- package/get-research-done/workflows/list-phase-assumptions.md +178 -0
- package/get-research-done/workflows/map-codebase.md +322 -0
- package/get-research-done/workflows/resume-project.md +307 -0
- package/get-research-done/workflows/transition.md +556 -0
- package/get-research-done/workflows/verify-phase.md +628 -0
- package/get-research-done/workflows/verify-work.md +596 -0
- package/hooks/dist/grd-check-update.js +61 -0
- package/hooks/dist/grd-statusline.js +84 -0
- package/package.json +47 -0
- package/scripts/audit-help-commands.sh +115 -0
- package/scripts/build-hooks.js +42 -0
- package/scripts/verify-all-commands.sh +246 -0
- package/scripts/verify-architect-warning.sh +35 -0
- package/scripts/verify-insights-mode.sh +40 -0
- package/scripts/verify-quick-mode.sh +20 -0
- package/scripts/verify-revise-data-routing.sh +139 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
# Hypothesis: {{hypothesis_title}}
|
|
2
|
+
|
|
3
|
+
**Created:** {{timestamp}}
|
|
4
|
+
**Phase:** 3 (Hypothesis Synthesis)
|
|
5
|
+
**Status:** {{draft|review|finalized}}
|
|
6
|
+
**Data Report:** {{path_to_data_report_or_none}}
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
metadata:
|
|
10
|
+
hypothesis_id: {{unique_id}}
|
|
11
|
+
version: 1
|
|
12
|
+
created: {{timestamp}}
|
|
13
|
+
phase: 3
|
|
14
|
+
status: {{draft|review|finalized}}
|
|
15
|
+
data_report: {{path_or_null}}
|
|
16
|
+
|
|
17
|
+
# Success metrics (for validation)
|
|
18
|
+
metrics:
|
|
19
|
+
- name: {{metric_name}}
|
|
20
|
+
threshold: {{value}}
|
|
21
|
+
comparison: {{greater_than|less_than|equal_to}}
|
|
22
|
+
weight: {{0.0-1.0}}
|
|
23
|
+
# Note: Weights must sum to 1.0
|
|
24
|
+
|
|
25
|
+
# Evaluation methodology
|
|
26
|
+
evaluation:
|
|
27
|
+
strategy: {{k-fold|stratified-k-fold|time-series-split|holdout}}
|
|
28
|
+
k_folds: {{number_or_null}}
|
|
29
|
+
test_size: {{proportion_or_null}}
|
|
30
|
+
random_state: 42
|
|
31
|
+
justification: {{why_this_strategy}}
|
|
32
|
+
|
|
33
|
+
# Baseline status
|
|
34
|
+
baseline_defined: {{true|false}}
|
|
35
|
+
baseline_count: {{number_of_baselines_or_0}}
|
|
36
|
+
|
|
37
|
+
# Falsification criteria
|
|
38
|
+
has_falsification_criteria: {{true|false}}
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Context
|
|
42
|
+
|
|
43
|
+
<!-- Background and motivation for this hypothesis (min 50 chars) -->
|
|
44
|
+
|
|
45
|
+
### Problem Statement
|
|
46
|
+
|
|
47
|
+
{{what_problem_or_question_is_being_addressed}}
|
|
48
|
+
|
|
49
|
+
### Motivation
|
|
50
|
+
|
|
51
|
+
{{why_this_hypothesis_matters}}
|
|
52
|
+
|
|
53
|
+
### Data Characteristics
|
|
54
|
+
|
|
55
|
+
{{data_insights_that_inform_this_hypothesis}}
|
|
56
|
+
|
|
57
|
+
<!-- If DATA_REPORT.md exists, reference specific findings:
|
|
58
|
+
- Missing data patterns that need handling
|
|
59
|
+
- Class imbalance requiring special attention
|
|
60
|
+
- Outliers or anomalies discovered
|
|
61
|
+
- Feature correlations that suggest relationships
|
|
62
|
+
- Leakage concerns that constrain approach
|
|
63
|
+
-->
|
|
64
|
+
|
|
65
|
+
### Known Constraints
|
|
66
|
+
|
|
67
|
+
{{limitations_or_constraints_that_apply}}
|
|
68
|
+
|
|
69
|
+
<!-- Examples:
|
|
70
|
+
- Computational resource limits
|
|
71
|
+
- Data quality issues from DATA_REPORT.md
|
|
72
|
+
- Domain-specific restrictions
|
|
73
|
+
- Time constraints
|
|
74
|
+
- Regulatory/ethical considerations
|
|
75
|
+
-->
|
|
76
|
+
|
|
77
|
+
## Hypothesis
|
|
78
|
+
|
|
79
|
+
<!-- Flexible prose format with required elements -->
|
|
80
|
+
|
|
81
|
+
### What
|
|
82
|
+
|
|
83
|
+
{{clear_statement_of_what_is_being_tested}}
|
|
84
|
+
|
|
85
|
+
<!-- Example: "A gradient boosting model trained on engineered temporal features will outperform a baseline logistic regression in predicting customer churn." -->
|
|
86
|
+
|
|
87
|
+
### Why
|
|
88
|
+
|
|
89
|
+
{{rationale_based_on_data_insights_or_domain_knowledge}}
|
|
90
|
+
|
|
91
|
+
<!-- Example: "DATA_REPORT.md shows strong temporal patterns in customer activity leading up to churn events. Non-linear methods like gradient boosting can capture these complex time-dependent relationships better than linear models." -->
|
|
92
|
+
|
|
93
|
+
### Expected Outcome
|
|
94
|
+
|
|
95
|
+
{{predicted_result_if_hypothesis_is_true}}
|
|
96
|
+
|
|
97
|
+
<!-- Example: "We expect to see:
|
|
98
|
+
- AUC-ROC improvement of at least 0.05 over baseline
|
|
99
|
+
- Precision at top 10% recall exceeding 0.70
|
|
100
|
+
- Temporally engineered features appearing in top 10 feature importances
|
|
101
|
+
" -->
|
|
102
|
+
|
|
103
|
+
## Success Metrics
|
|
104
|
+
|
|
105
|
+
<!-- How success is measured. Weights must sum to 1.0. -->
|
|
106
|
+
|
|
107
|
+
| Metric | Threshold | Comparison | Weight | Notes |
|
|
108
|
+
|--------|-----------|------------|--------|-------|
|
|
109
|
+
| {{metric_name_1}} | {{value}} | {{greater_than\|less_than}} | {{0.0-1.0}} | {{context}} |
|
|
110
|
+
| {{metric_name_2}} | {{value}} | {{greater_than\|less_than}} | {{0.0-1.0}} | {{context}} |
|
|
111
|
+
|
|
112
|
+
**Success Definition:** Weighted average of metrics must meet or exceed composite threshold of {{value}}.
|
|
113
|
+
|
|
114
|
+
**Metric Types Supported:**
|
|
115
|
+
- **Absolute thresholds**: "Accuracy >= 0.85"
|
|
116
|
+
- **Relative improvements**: "AUC improvement >= 0.05 over baseline"
|
|
117
|
+
|
|
118
|
+
**Note:** If any metric requires human judgment (e.g., "qualitative assessment of interpretability"), this will trigger a human evaluation gate during experimentation.
|
|
119
|
+
|
|
120
|
+
## Evaluation Methodology
|
|
121
|
+
|
|
122
|
+
<!-- Must be defined upfront to prevent p-hacking and post-hoc metric selection -->
|
|
123
|
+
|
|
124
|
+
**Strategy:** {{k-fold|stratified-k-fold|time-series-split|holdout}}
|
|
125
|
+
|
|
126
|
+
**Parameters:**
|
|
127
|
+
- K-folds: {{number_or_null}}
|
|
128
|
+
- Test size: {{proportion_or_null}}
|
|
129
|
+
- Random state: 42 (for reproducibility)
|
|
130
|
+
- {{additional_parameters}}
|
|
131
|
+
|
|
132
|
+
**Justification:**
|
|
133
|
+
|
|
134
|
+
{{why_this_evaluation_strategy_is_appropriate}}
|
|
135
|
+
|
|
136
|
+
<!-- Examples:
|
|
137
|
+
- "Stratified k-fold (k=5) preserves class distribution in each fold, critical given 8:1 imbalance in target variable (per DATA_REPORT.md)"
|
|
138
|
+
- "Time-series split with 80/20 train/test prevents temporal leakage identified in DATA_REPORT.md"
|
|
139
|
+
- "Holdout validation (70/30 split) sufficient given large dataset size (>100k samples)"
|
|
140
|
+
-->
|
|
141
|
+
|
|
142
|
+
**Statistical Significance:**
|
|
143
|
+
|
|
144
|
+
{{how_statistical_significance_will_be_assessed}}
|
|
145
|
+
|
|
146
|
+
<!-- Examples:
|
|
147
|
+
- "95% confidence intervals via bootstrapping (1000 iterations)"
|
|
148
|
+
- "Paired t-test between model and baseline (p<0.05 threshold)"
|
|
149
|
+
- "Not applicable - deterministic evaluation"
|
|
150
|
+
-->
|
|
151
|
+
|
|
152
|
+
## Baselines
|
|
153
|
+
|
|
154
|
+
<!-- Comparison points for hypothesis validation. -->
|
|
155
|
+
|
|
156
|
+
| Baseline | Type | Expected Performance | Citation | Status |
|
|
157
|
+
|----------|------|---------------------|----------|--------|
|
|
158
|
+
| {{baseline_name}} | {{own_implementation\|literature_citation}} | {{metric_value}} | {{paper_or_url}} | {{pending\|complete}} |
|
|
159
|
+
|
|
160
|
+
**Baseline Ordering (IMPORTANT):**
|
|
161
|
+
- **First baseline listed = PRIMARY baseline (required)**
|
|
162
|
+
- Researcher agent blocks if primary baseline results not found
|
|
163
|
+
- Must have completed run with metrics.json before main experiment
|
|
164
|
+
- **Subsequent baselines = SECONDARY baselines (optional)**
|
|
165
|
+
- Researcher warns if missing but proceeds
|
|
166
|
+
- SCORECARD shows comparison for all available baselines
|
|
167
|
+
|
|
168
|
+
**Running Baselines:**
|
|
169
|
+
```bash
|
|
170
|
+
# Run primary baseline first
|
|
171
|
+
/grd:research --baseline {{primary_baseline_name}}
|
|
172
|
+
|
|
173
|
+
# Then run main experiment
|
|
174
|
+
/grd:research
|
|
175
|
+
|
|
176
|
+
# Optionally add secondary baselines
|
|
177
|
+
/grd:research --baseline {{secondary_baseline_name}}
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
**Baseline Types:**
|
|
181
|
+
- **own_implementation**: You will run this baseline yourself during experimentation
|
|
182
|
+
- Results stored in experiments/run_*_{{baseline_name}}/metrics.json
|
|
183
|
+
- Full provenance tracked (code, config, data hash)
|
|
184
|
+
- **literature_citation**: Published result from paper/benchmark
|
|
185
|
+
- Expected performance entered manually in table
|
|
186
|
+
- No per-fold data for statistical significance testing
|
|
187
|
+
|
|
188
|
+
**Warning:** If this section is empty, the system will warn but proceed. Baselines provide essential context for evaluating hypothesis success.
|
|
189
|
+
|
|
190
|
+
**Skip Validation:** Use the `--skip-baseline` flag to bypass baseline validation (not recommended):
|
|
191
|
+
```bash
|
|
192
|
+
/grd:research --skip-baseline
|
|
193
|
+
```
|
|
194
|
+
This is logged for audit trail and noted in SCORECARD.
|
|
195
|
+
|
|
196
|
+
**Caching:** Baseline results can be cached if configuration hasn't changed. No need to re-run baseline for every experiment iteration.
|
|
197
|
+
|
|
198
|
+
## Falsification Criteria
|
|
199
|
+
|
|
200
|
+
<!-- What would disprove this hypothesis? At least one criterion required. -->
|
|
201
|
+
|
|
202
|
+
| Criterion | Metric | Threshold | Type | Explanation |
|
|
203
|
+
|-----------|--------|-----------|------|-------------|
|
|
204
|
+
| {{criterion_name}} | {{metric_name}} | {{value}} | {{quantitative\|qualitative}} | {{what_falsification_means}} |
|
|
205
|
+
|
|
206
|
+
**Types:**
|
|
207
|
+
- **quantitative** (preferred): Numeric threshold that objectively disproves hypothesis
|
|
208
|
+
- **qualitative**: Subjective assessment (use sparingly, document clearly)
|
|
209
|
+
|
|
210
|
+
**Falsification Meaning:**
|
|
211
|
+
|
|
212
|
+
{{what_it_means_scientifically_if_criteria_are_met}}
|
|
213
|
+
|
|
214
|
+
<!-- Example: "If the model's AUC-ROC is within 0.02 of the baseline, the hypothesis that temporal features provide meaningful signal is falsified. This would suggest that churn is not driven by time-dependent patterns, and we should explore other feature engineering approaches or problem formulations." -->
|
|
215
|
+
|
|
216
|
+
**Critic Routing:**
|
|
217
|
+
|
|
218
|
+
When falsification criteria are met, the Critic agent will decide:
|
|
219
|
+
- **REVISE_DATA**: Return to data exploration with specific concerns
|
|
220
|
+
- **REVISE_METHOD**: Adjust methodology while keeping hypothesis
|
|
221
|
+
- **HUMAN**: Hand off to human for strategic decision
|
|
222
|
+
|
|
223
|
+
## Constraints
|
|
224
|
+
|
|
225
|
+
<!-- Optional section: Known limitations that bound the experiment -->
|
|
226
|
+
|
|
227
|
+
{{constraints_from_data_report}}
|
|
228
|
+
|
|
229
|
+
<!-- Examples from DATA_REPORT.md:
|
|
230
|
+
- "30% missing values in feature X require imputation strategy"
|
|
231
|
+
- "Class imbalance (15:1) requires stratified sampling"
|
|
232
|
+
- "High correlation (0.92) between features A and B suggests redundancy"
|
|
233
|
+
- "Temporal leakage risk in rolling features - must compute train-only"
|
|
234
|
+
-->
|
|
235
|
+
|
|
236
|
+
{{resource_constraints}}
|
|
237
|
+
|
|
238
|
+
<!-- Examples:
|
|
239
|
+
- "Training time limited to 2 hours per experiment"
|
|
240
|
+
- "Memory constraint: 16GB RAM available"
|
|
241
|
+
- "GPU not available - CPU training only"
|
|
242
|
+
-->
|
|
243
|
+
|
|
244
|
+
{{scope_boundaries}}
|
|
245
|
+
|
|
246
|
+
<!-- Examples:
|
|
247
|
+
- "Experiment limited to tabular features only (no text/images)"
|
|
248
|
+
- "Single-label classification only (no multi-label)"
|
|
249
|
+
- "Batch inference only (no real-time serving requirements)"
|
|
250
|
+
-->
|
|
251
|
+
|
|
252
|
+
## Non-Goals
|
|
253
|
+
|
|
254
|
+
<!-- Optional section: Explicit exclusions to prevent scope creep -->
|
|
255
|
+
|
|
256
|
+
This hypothesis does NOT attempt to prove:
|
|
257
|
+
|
|
258
|
+
- {{explicit_exclusion_1}}
|
|
259
|
+
- {{explicit_exclusion_2}}
|
|
260
|
+
|
|
261
|
+
<!-- Examples:
|
|
262
|
+
- "This does not prove production readiness (only predictive performance)"
|
|
263
|
+
- "This does not address model interpretability (explainability is out of scope)"
|
|
264
|
+
- "This does not claim generalization beyond the current dataset"
|
|
265
|
+
- "This does not optimize for inference latency (accuracy-focused only)"
|
|
266
|
+
-->
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
*Template: get-research-done/templates/objective.md*
|
|
271
|
+
*To be populated by: grd-architect agent or user directly*
|