mindforge-cc 11.5.0 → 11.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/.agent/mindforge/skill-tdd.md +53 -0
  2. package/.agent/mindforge/skills-index.md +118 -0
  3. package/.agent/mindforge/systematic-debug.md +60 -0
  4. package/.agent/skills/1password-skill/SKILL.md +156 -0
  5. package/.agent/skills/1password-skill/references/cli-examples.md +31 -0
  6. package/.agent/skills/1password-skill/references/get-started.md +21 -0
  7. package/.agent/skills/article-illustrator/SKILL.md +199 -0
  8. package/.agent/skills/article-illustrator/references/prompt-construction.md +426 -0
  9. package/.agent/skills/article-illustrator/references/style-presets.md +80 -0
  10. package/.agent/skills/article-illustrator/references/styles.md +224 -0
  11. package/.agent/skills/article-illustrator/references/usage.md +50 -0
  12. package/.agent/skills/article-illustrator/references/workflow.md +332 -0
  13. package/.agent/skills/arxiv/SKILL.md +275 -0
  14. package/.agent/skills/blogwatcher/SKILL.md +130 -0
  15. package/.agent/skills/code-wiki/SKILL.md +438 -0
  16. package/.agent/skills/code-wiki/templates/README.md +31 -0
  17. package/.agent/skills/code-wiki/templates/architecture.md +30 -0
  18. package/.agent/skills/code-wiki/templates/getting-started.md +47 -0
  19. package/.agent/skills/code-wiki/templates/module.md +38 -0
  20. package/.agent/skills/codebase-inspection/SKILL.md +109 -0
  21. package/.agent/skills/comic-creator/SKILL.md +240 -0
  22. package/.agent/skills/comic-creator/references/analysis-framework.md +176 -0
  23. package/.agent/skills/comic-creator/references/auto-selection.md +71 -0
  24. package/.agent/skills/comic-creator/references/base-prompt.md +98 -0
  25. package/.agent/skills/comic-creator/references/character-template.md +180 -0
  26. package/.agent/skills/comic-creator/references/ohmsha-guide.md +85 -0
  27. package/.agent/skills/comic-creator/references/partial-workflows.md +106 -0
  28. package/.agent/skills/comic-creator/references/storyboard-template.md +143 -0
  29. package/.agent/skills/comic-creator/references/workflow.md +401 -0
  30. package/.agent/skills/concept-diagrams/SKILL.md +355 -0
  31. package/.agent/skills/concept-diagrams/references/dashboard-patterns.md +43 -0
  32. package/.agent/skills/concept-diagrams/references/infrastructure-patterns.md +144 -0
  33. package/.agent/skills/concept-diagrams/references/physical-shape-cookbook.md +42 -0
  34. package/.agent/skills/creative-ideation/SKILL.md +144 -0
  35. package/.agent/skills/creative-ideation/references/full-prompt-library.md +110 -0
  36. package/.agent/skills/devops-cli/SKILL.md +149 -0
  37. package/.agent/skills/devops-cli/references/app-discovery.md +112 -0
  38. package/.agent/skills/devops-cli/references/authentication.md +59 -0
  39. package/.agent/skills/devops-cli/references/cli-reference.md +104 -0
  40. package/.agent/skills/devops-cli/references/running-apps.md +171 -0
  41. package/.agent/skills/devops-watchers/SKILL.md +103 -0
  42. package/.agent/skills/docker-management/SKILL.md +273 -0
  43. package/.agent/skills/domain-intel/SKILL.md +96 -0
  44. package/.agent/skills/duckduckgo-search/SKILL.md +230 -0
  45. package/.agent/skills/github-auth/SKILL.md +240 -0
  46. package/.agent/skills/github-code-review/SKILL.md +474 -0
  47. package/.agent/skills/github-code-review/references/review-output-template.md +74 -0
  48. package/.agent/skills/github-issues/SKILL.md +363 -0
  49. package/.agent/skills/github-issues/templates/bug-report.md +35 -0
  50. package/.agent/skills/github-issues/templates/feature-request.md +31 -0
  51. package/.agent/skills/github-pr-workflow/SKILL.md +360 -0
  52. package/.agent/skills/github-pr-workflow/references/ci-troubleshooting.md +183 -0
  53. package/.agent/skills/github-pr-workflow/references/conventional-commits.md +71 -0
  54. package/.agent/skills/github-pr-workflow/templates/pr-body-bugfix.md +35 -0
  55. package/.agent/skills/github-pr-workflow/templates/pr-body-feature.md +33 -0
  56. package/.agent/skills/github-repo-management/SKILL.md +509 -0
  57. package/.agent/skills/github-repo-management/references/github-api-cheatsheet.md +161 -0
  58. package/.agent/skills/godmode/SKILL.md +396 -0
  59. package/.agent/skills/godmode/references/jailbreak-templates.md +128 -0
  60. package/.agent/skills/godmode/references/refusal-detection.md +142 -0
  61. package/.agent/skills/hyperframes/SKILL.md +182 -0
  62. package/.agent/skills/hyperframes/references/cli.md +185 -0
  63. package/.agent/skills/hyperframes/references/composition.md +129 -0
  64. package/.agent/skills/hyperframes/references/features.md +289 -0
  65. package/.agent/skills/hyperframes/references/gsap.md +136 -0
  66. package/.agent/skills/hyperframes/references/troubleshooting.md +137 -0
  67. package/.agent/skills/hyperframes/references/website-to-video.md +145 -0
  68. package/.agent/skills/jupyter-live-kernel/SKILL.md +160 -0
  69. package/.agent/skills/kanban-orchestrator/SKILL.md +209 -0
  70. package/.agent/skills/kanban-worker/SKILL.md +188 -0
  71. package/.agent/skills/llm-wiki/SKILL.md +499 -0
  72. package/.agent/skills/meme-generation/SKILL.md +122 -0
  73. package/.agent/skills/node-inspect-debugger/SKILL.md +312 -0
  74. package/.agent/skills/obsidian/SKILL.md +60 -0
  75. package/.agent/skills/osint-investigation/SKILL.md +269 -0
  76. package/.agent/skills/osint-investigation/templates/source-template.md +59 -0
  77. package/.agent/skills/oss-forensics/SKILL.md +422 -0
  78. package/.agent/skills/oss-forensics/references/evidence-types.md +89 -0
  79. package/.agent/skills/oss-forensics/references/github-archive-guide.md +184 -0
  80. package/.agent/skills/oss-forensics/references/investigation-templates.md +131 -0
  81. package/.agent/skills/oss-forensics/references/recovery-techniques.md +164 -0
  82. package/.agent/skills/oss-forensics/templates/forensic-report.md +151 -0
  83. package/.agent/skills/oss-forensics/templates/malicious-package-report.md +43 -0
  84. package/.agent/skills/parallel-cli/SKILL.md +384 -0
  85. package/.agent/skills/pinggy-tunnel/SKILL.md +302 -0
  86. package/.agent/skills/pixel-art/SKILL.md +209 -0
  87. package/.agent/skills/pixel-art/references/palettes.md +49 -0
  88. package/.agent/skills/plan/SKILL.md +331 -0
  89. package/.agent/skills/polymarket/SKILL.md +75 -0
  90. package/.agent/skills/polymarket/references/api-endpoints.md +220 -0
  91. package/.agent/skills/python-debugpy/SKILL.md +368 -0
  92. package/.agent/skills/requesting-code-review/SKILL.md +273 -0
  93. package/.agent/skills/research-paper-writing/SKILL.md +2367 -0
  94. package/.agent/skills/research-paper-writing/references/autoreason-methodology.md +394 -0
  95. package/.agent/skills/research-paper-writing/references/checklists.md +434 -0
  96. package/.agent/skills/research-paper-writing/references/citation-workflow.md +563 -0
  97. package/.agent/skills/research-paper-writing/references/experiment-patterns.md +728 -0
  98. package/.agent/skills/research-paper-writing/references/human-evaluation.md +476 -0
  99. package/.agent/skills/research-paper-writing/references/paper-types.md +481 -0
  100. package/.agent/skills/research-paper-writing/references/reviewer-guidelines.md +433 -0
  101. package/.agent/skills/research-paper-writing/references/sources.md +191 -0
  102. package/.agent/skills/research-paper-writing/references/writing-guide.md +474 -0
  103. package/.agent/skills/research-paper-writing/templates/README.md +251 -0
  104. package/.agent/skills/rest-graphql-debug/SKILL.md +507 -0
  105. package/.agent/skills/s6-container-supervision/SKILL.md +171 -0
  106. package/.agent/skills/scrapling/SKILL.md +328 -0
  107. package/.agent/skills/sherlock/SKILL.md +186 -0
  108. package/.agent/skills/simplify-code/SKILL.md +168 -0
  109. package/.agent/skills/skill-authoring/SKILL.md +158 -0
  110. package/.agent/skills/spike/SKILL.md +190 -0
  111. package/.agent/skills/subagent-driven-development/SKILL.md +345 -0
  112. package/.agent/skills/subagent-driven-development/references/context-budget-discipline.md +53 -0
  113. package/.agent/skills/subagent-driven-development/references/gates-taxonomy.md +93 -0
  114. package/.agent/skills/systematic-debugging/SKILL.md +360 -0
  115. package/.agent/skills/test-driven-development/SKILL.md +336 -0
  116. package/.agent/skills/video-orchestrator/SKILL.md +194 -0
  117. package/.agent/skills/video-orchestrator/references/examples.md +227 -0
  118. package/.agent/skills/video-orchestrator/references/intake.md +166 -0
  119. package/.agent/skills/video-orchestrator/references/kanban-setup.md +278 -0
  120. package/.agent/skills/video-orchestrator/references/monitoring.md +180 -0
  121. package/.agent/skills/video-orchestrator/references/role-archetypes.md +298 -0
  122. package/.agent/skills/video-orchestrator/references/tool-matrix.md +317 -0
  123. package/.agent/skills/web-pentest/SKILL.md +332 -0
  124. package/.agent/skills/web-pentest/references/bypass-techniques.md +133 -0
  125. package/.agent/skills/web-pentest/references/exploitation-techniques.md +204 -0
  126. package/.agent/skills/web-pentest/references/scope-enforcement.md +110 -0
  127. package/.agent/skills/web-pentest/references/vuln-taxonomy.md +81 -0
  128. package/.agent/skills/web-pentest/templates/authorization.md +69 -0
  129. package/.agent/skills/web-pentest/templates/pentest-report.md +178 -0
  130. package/.claude/commands/mindforge/skill-tdd.md +53 -0
  131. package/.claude/commands/mindforge/skills-index.md +118 -0
  132. package/.claude/commands/mindforge/systematic-debug.md +60 -0
  133. package/.mindforge/config.json +2 -2
  134. package/.mindforge/memory/sync-manifest.json +1 -1
  135. package/.mindforge/skills/arxiv/SKILL.md +294 -0
  136. package/.mindforge/skills/blogwatcher/SKILL.md +147 -0
  137. package/.mindforge/skills/code-wiki/SKILL.md +457 -0
  138. package/.mindforge/skills/codebase-inspection/SKILL.md +126 -0
  139. package/.mindforge/skills/concept-diagrams/SKILL.md +373 -0
  140. package/.mindforge/skills/creative-ideation/SKILL.md +162 -0
  141. package/.mindforge/skills/domain-intel/SKILL.md +116 -0
  142. package/.mindforge/skills/duckduckgo-search/SKILL.md +249 -0
  143. package/.mindforge/skills/github-code-review/SKILL.md +493 -0
  144. package/.mindforge/skills/github-issues/SKILL.md +382 -0
  145. package/.mindforge/skills/github-pr-workflow/SKILL.md +379 -0
  146. package/.mindforge/skills/jupyter-live-kernel/SKILL.md +179 -0
  147. package/.mindforge/skills/kanban-orchestrator/SKILL.md +227 -0
  148. package/.mindforge/skills/kanban-worker/SKILL.md +206 -0
  149. package/.mindforge/skills/meme-generation/SKILL.md +141 -0
  150. package/.mindforge/skills/obsidian/SKILL.md +80 -0
  151. package/.mindforge/skills/osint-investigation/SKILL.md +288 -0
  152. package/.mindforge/skills/oss-forensics/SKILL.md +421 -0
  153. package/.mindforge/skills/pixel-art/SKILL.md +228 -0
  154. package/.mindforge/skills/plan/SKILL.md +350 -0
  155. package/.mindforge/skills/requesting-code-review/SKILL.md +292 -0
  156. package/.mindforge/skills/research-paper-writing/SKILL.md +2384 -0
  157. package/.mindforge/skills/scrapling/SKILL.md +345 -0
  158. package/.mindforge/skills/sherlock/SKILL.md +203 -0
  159. package/.mindforge/skills/simplify-code/SKILL.md +187 -0
  160. package/.mindforge/skills/spike/SKILL.md +209 -0
  161. package/.mindforge/skills/subagent-driven-development/SKILL.md +364 -0
  162. package/.mindforge/skills/systematic-debugging/SKILL.md +379 -0
  163. package/.mindforge/skills/test-driven-development/SKILL.md +355 -0
  164. package/.mindforge/skills/web-pentest/SKILL.md +327 -0
  165. package/CHANGELOG.md +88 -0
  166. package/MINDFORGE.md +3 -3
  167. package/README.md +38 -3
  168. package/RELEASENOTES.md +100 -0
  169. package/bin/dashboard/api-router.js +10 -1
  170. package/bin/governance/approve.js +5 -1
  171. package/bin/memory/federated-sync.js +11 -2
  172. package/bin/memory/knowledge-capture.js +10 -1
  173. package/bin/memory/pillar-health-tracker.js +9 -1
  174. package/bin/review/ads-engine.js +2 -2
  175. package/bin/security/trust-boundaries.js +5 -0
  176. package/docs/getting-started.md +42 -5
  177. package/package.json +1 -1
@@ -0,0 +1,728 @@
1
+ # Experiment Design Patterns
2
+
3
+ Patterns and best practices distilled from running research experiments at scale.
4
+
5
+ ---
6
+
7
+ ## Experiment Infrastructure
8
+
9
+ ### Directory Structure
10
+
11
+ Organize experiments with a consistent structure:
12
+
13
+ ```
14
+ workspace/
15
+ experiments/
16
+ run_main.py # Core experiment runner
17
+ run_baselines.py # Baseline comparison
18
+ run_ablation.py # Ablation studies
19
+ strategies.py # Method implementations
20
+ config.yaml # Shared configuration
21
+ results/
22
+ <experiment_name>/
23
+ <task_or_problem>/
24
+ <strategy>/
25
+ result.json # Final metrics
26
+ final_output.md # Final output artifact
27
+ history.json # Full trajectory/log
28
+ pass_01/ # Per-iteration artifacts (if iterative)
29
+ intermediate.md
30
+ analysis/
31
+ analyze_results.py # Statistical analysis
32
+ compute_stats.py # Significance tests
33
+ make_charts.py # Visualization
34
+ paper/
35
+ paper.tex # LaTeX source
36
+ fig_*.pdf # Generated figures
37
+ ```
38
+
39
+ ### Script Design Principles
40
+
41
+ **1. Incremental Saving (Crash Recovery)**
42
+
43
+ Every experiment script should save results after each unit of work, and skip already-completed work on restart:
44
+
45
+ ```python
46
+ import json, os
47
+ from pathlib import Path
48
+
49
def run_experiment(problems, strategies, output_dir):
    """Run every (problem, strategy) pair, checkpointing each result to disk.

    A pair whose ``result.json`` already exists is skipped, so re-running the
    script after a crash resumes instead of repeating finished work.

    Args:
        problems: Sequence of dicts; each needs an ``"id"`` key, which is used
            as the per-problem directory name.
        strategies: Sequence of strategy names, used as sub-directory names.
        output_dir: Root directory; results are written to
            ``<output_dir>/<problem id>/<strategy>/result.json``.

    ``execute_strategy(problem, strategy)`` must be provided by the
    surrounding script and return something ``json.dump`` can serialize.
    """
    for problem in problems:
        for strategy in strategies:
            result_path = Path(output_dir) / problem["id"] / strategy / "result.json"
            if result_path.exists():
                print(f"Skipping {problem['id']}/{strategy} (already done)")
                continue

            # Run the experiment
            result = execute_strategy(problem, strategy)

            # Save immediately. Write to a sibling temp file and rename it into
            # place: a crash in the middle of the write must not leave a
            # truncated result.json behind, because the existence check above
            # would then treat the broken file as completed work.
            result_path.parent.mkdir(parents=True, exist_ok=True)
            tmp_path = result_path.with_name(result_path.name + ".tmp")
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2)
            os.replace(tmp_path, result_path)
64
+ ```
65
+
66
+ This pattern makes re-runs safe and efficient. If a process crashes at problem 47/150, restarting skips the first 46.
67
+
68
+ **2. Artifact Preservation**
69
+
70
+ Save all intermediate outputs, not just final results. This enables post-hoc analysis without re-running:
71
+
72
+ ```python
73
def save_pass_artifacts(output_dir, pass_num, artifacts):
    """Save all artifacts from a single pass of an iterative method.

    Args:
        output_dir: Directory that holds the per-pass sub-directories.
        pass_num: Integer pass number; formatted as a two-digit suffix
            (``pass_01``, ``pass_02``, ...).
        artifacts: Mapping of artifact name to text content. Each entry is
            written to ``<name>.md`` inside the pass directory.
    """
    pass_dir = Path(output_dir) / f"pass_{pass_num:02d}"
    pass_dir.mkdir(parents=True, exist_ok=True)

    for name, content in artifacts.items():
        # Pin the encoding: the platform default is not UTF-8 everywhere, and
        # artifact text may contain non-ASCII characters.
        (pass_dir / f"{name}.md").write_text(content, encoding="utf-8")
81
+ ```
82
+
83
+ **3. Configuration Management**
84
+
85
+ Use YAML configs for reproducibility:
86
+
87
+ ```yaml
88
+ # config.yaml
89
+ model: anthropic/claude-sonnet-4-20250514
90
+ author_temperature: 0.8
91
+ judge_temperature: 0.3
92
+ max_tokens: 4096
93
+ num_judges: 3
94
+ max_passes: 15
95
+ convergence_k: 2
96
+ ```
97
+
98
+ ```python
99
+ import yaml
100
+
101
+ with open("config.yaml") as f:
102
+ config = yaml.safe_load(f)
103
+ ```
104
+
105
+ **4. Separation of Concerns**
106
+
107
+ Keep generation, evaluation, and visualization in separate scripts:
108
+
109
+ | Script | Purpose |
110
+ |--------|---------|
111
+ | `run_experiment.py` | Core method execution |
112
+ | `run_baselines.py` | Baseline comparisons at same compute |
113
+ | `run_eval.py` | Blind evaluation / judge panels |
114
+ | `analyze_results.py` | Statistical analysis |
115
+ | `make_charts.py` | Figure generation |
116
+
117
+ This lets you re-run evaluation without re-running expensive generation, and regenerate figures without re-running analysis.
118
+
119
+ ---
120
+
121
+ ## Evaluation Protocols
122
+
123
+ ### Blind Judge Panels (for Subjective Tasks)
124
+
125
+ When evaluating subjective outputs (writing, analysis, recommendations), use a blind judge panel:
126
+
127
+ ```python
128
+ import random
129
+
130
def run_blind_evaluation(outputs: dict, task_prompt: str, num_judges: int = 7):
    """
    Run blind evaluation of multiple method outputs.

    Args:
        outputs: {"method_name": "output_text", ...}
        task_prompt: The original task description
        num_judges: Number of independent judge evaluations

    Returns:
        Borda scores keyed by method name (see compute_borda).
    """
    rankings = []

    for _ in range(num_judges):
        # Randomize labels and presentation order per judge
        methods = list(outputs.keys())
        random.shuffle(methods)
        labels = {m: chr(65 + i) for i, m in enumerate(methods)}  # A, B, C...

        # Present to judge with randomized labels
        prompt = f"Task: {task_prompt}\n\n"
        for method in methods:
            prompt += f"--- Proposal {labels[method]} ---\n{outputs[method]}\n\n"
        prompt += "Rank all proposals from best to worst. Format: RANKING: [best], [second], [worst]"

        # NOTE(review): assumes call_judge returns an ordered sequence of
        # proposal labels, best first -- confirm against its implementation.
        ranking = call_judge(prompt)
        rankings.append({"labels": labels, "ranking": ranking})

    # Aggregate via Borda count, sized to the number of methods actually compared
    return compute_borda(rankings, n_methods=len(outputs))


def compute_borda(rankings, n_methods=3):
    """Aggregate per-judge rankings into Borda scores keyed by method.

    The entry at 0-based position ``p`` of a ranking (best first) earns
    ``n_methods - p`` points, never less than zero; with the default of 3
    that is 3/2/1 points for 1st/2nd/3rd.

    Args:
        rankings: List of dicts, each with a ``"ranking"`` sequence and,
            optionally, a ``"labels"`` dict mapping method name to the label
            that judge saw. Ranking entries that match a label are credited
            to the corresponding method; any other entry is used as the key
            unchanged.
        n_methods: Number of methods being compared.

    Returns:
        Dict mapping method to its total points across all rankings.
    """
    scores = {}

    for r in rankings:
        # Labels are re-randomized for every judge, so "A" means a different
        # method in each ranking. Translate back before accumulating.
        label_to_method = {label: method for method, label in r.get("labels", {}).items()}
        for position, entry in enumerate(r["ranking"]):
            method = label_to_method.get(entry, entry)
            scores[method] = scores.get(method, 0) + max(n_methods - position, 0)

    return scores
169
+ ```
170
+
171
+ Key design decisions:
172
+ - **Randomize both labels AND order** per judge to prevent position bias
173
+ - **Use an odd number of judges** (3, 5, 7) to break ties
174
+ - **Conservative tiebreak**: Incumbent/baseline wins ties (prevents false positives)
175
+ - **CoT judges** match non-CoT quality at ~40% cost (1 CoT judge ≈ 3 standard judges)
176
+
177
+ ### Code/Objective Evaluation
178
+
179
+ For tasks with ground-truth evaluation (code, math, factual):
180
+
181
+ ```python
182
+ import subprocess
183
+
184
def evaluate_code(solution: str, test_cases: list, timeout: int = 30):
    """Run a code solution against test cases, one fresh Python process each.

    A test passes only if the process exits with status 0 within ``timeout``
    seconds and its stripped stdout equals the stripped expected output.

    Note: a child process is NOT a sandbox. The solution runs with the
    caller's privileges, so untrusted code needs real isolation (container,
    VM, restricted user) around this function.

    Args:
        solution: Python source code, executed via ``-c``.
        test_cases: List of dicts with ``"input"`` (text fed to stdin),
            ``"expected"`` (expected stdout) and an optional truthy
            ``"public"`` flag.
        timeout: Per-test time limit in seconds.

    Returns:
        Dict with ``"public_pass_rate"`` and ``"private_pass_rate"``; a
        category with no tests reports 0.0.
    """
    import sys  # local import: the surrounding snippet only imports subprocess

    results = {"public": [], "private": []}

    for test in test_cases:
        try:
            proc = subprocess.run(
                # sys.executable is the interpreter running this script, so the
                # solution sees the same environment and no "python3" lookup
                # on PATH is needed.
                [sys.executable, "-c", solution],
                input=test["input"],
                capture_output=True,
                timeout=timeout,
                text=True,
            )
        except subprocess.TimeoutExpired:
            passed = False
        else:
            # A solution that prints the right answer and then crashes is
            # still a failure, so the exit status is part of the verdict.
            passed = (
                proc.returncode == 0
                and proc.stdout.strip() == test["expected"].strip()
            )

        category = "public" if test.get("public") else "private"
        results[category].append(passed)

    return {
        "public_pass_rate": sum(results["public"]) / max(len(results["public"]), 1),
        "private_pass_rate": sum(results["private"]) / max(len(results["private"]), 1),
    }
210
+ ```
211
+
212
+ ### Compute-Matched Comparison
213
+
214
+ Always compare methods at equal compute budget. If your method uses N API calls, baselines get N calls too:
215
+
216
+ | Method | Call Budget | Allocation |
217
+ |--------|-----------|------------|
218
+ | Single pass | 6 calls | 6 independent generations |
219
+ | Critique & revise | 6 calls | 1 generate + 5 revise rounds |
220
+ | Autoreason | 6 calls | 1 generate + 1 analysis + 4 revisions |
221
+ | Best-of-N | 6 calls | 6 independent, pick best on public test |
222
+
223
+ ### Human Evaluation Design
224
+
225
+ Many ML/NLP papers require human evaluation, especially for subjective tasks (text generation, summarization, dialogue, creative writing). Poorly designed human evals are a common rejection reason.
226
+
227
+ #### When Human Evaluation Is Required
228
+
229
+ | Task Type | Required? | Notes |
230
+ |-----------|-----------|-------|
231
+ | Text generation (open-ended) | Yes | LLM-as-judge alone is insufficient for acceptance at ACL/EMNLP |
232
+ | Summarization | Usually | At minimum for a subset of outputs |
233
+ | Dialogue systems | Yes | User studies or annotation |
234
+ | Code generation | No | Test suites are objective ground truth |
235
+ | Classification | No | Standard metrics suffice |
236
+ | Any task with subjective quality | Strongly recommended | Strengthens the paper significantly |
237
+
238
+ #### Annotation Protocol Design
239
+
240
+ ```
241
+ Human Evaluation Protocol:
242
+ 1. Define the evaluation dimensions (fluency, relevance, factual accuracy, etc.)
243
+ 2. Create annotation guidelines with examples of each score level
244
+ 3. Run a pilot with 2-3 annotators on 20-30 examples
245
+ 4. Compute pilot inter-annotator agreement — if low, revise guidelines
246
+ 5. Run full evaluation
247
+ 6. Report: annotator count, agreement metrics, compensation, time per item
248
+ ```
249
+
250
+ **Evaluation dimensions** (pick relevant subset):
251
+
252
+ | Dimension | Definition | Scale |
253
+ |-----------|-----------|-------|
254
+ | Fluency | Grammaticality and naturalness | 1-5 Likert |
255
+ | Relevance | Does it address the task? | 1-5 Likert |
256
+ | Factual accuracy | Are stated facts correct? | Binary or 1-5 |
257
+ | Coherence | Logical flow and consistency | 1-5 Likert |
258
+ | Informativeness | Does it provide useful information? | 1-5 Likert |
259
+ | Overall preference | Which output is better? | A/B/Tie (pairwise) |
260
+
261
+ **Pairwise comparison** (preferred over absolute scoring — more reliable):
262
+ - Present two outputs side-by-side (randomize left/right position)
263
+ - Ask: "Which is better? A / B / Tie"
264
+ - More discriminative and less susceptible to annotator calibration drift
265
+
266
+ #### Inter-Annotator Agreement
267
+
268
+ Always report agreement metrics. Without them, reviewers assume your annotations are unreliable.
269
+
270
+ ```python
271
+ # Krippendorff's alpha (preferred — handles missing data, any scale)
272
+ # pip install krippendorff
273
+ import krippendorff
274
+
275
+ # Ratings: rows = annotators, columns = items, values = scores
276
+ ratings = [
277
+ [3, 4, 1, 2, 5, None, 3], # Annotator 1
278
+ [3, 5, 1, 3, 5, 2, 3], # Annotator 2
279
+ [4, 4, 2, 2, 4, 2, None], # Annotator 3
280
+ ]
281
+ alpha = krippendorff.alpha(reliability_data=ratings, level_of_measurement="ordinal")
282
+ print(f"Krippendorff's alpha: {alpha:.3f}")
283
+ # Interpretation: >0.80 good, 0.67-0.80 acceptable, <0.67 questionable
284
+ ```
285
+
286
+ ```python
287
+ # Cohen's kappa (for exactly 2 annotators, categorical data)
288
+ from sklearn.metrics import cohen_kappa_score
289
+
290
+ annotator_1 = [1, 2, 3, 1, 2, 3, 2]
291
+ annotator_2 = [1, 2, 2, 1, 3, 3, 2]
292
+ kappa = cohen_kappa_score(annotator_1, annotator_2)
293
+ print(f"Cohen's kappa: {kappa:.3f}")
294
+ # Interpretation: >0.80 excellent, 0.60-0.80 substantial, 0.40-0.60 moderate
295
+ ```
296
+
297
+ | Metric | When to Use | Annotators | Scale |
298
+ |--------|------------|-----------|-------|
299
+ | Krippendorff's alpha | Default choice | Any number | Any (ordinal, nominal, ratio) |
300
+ | Cohen's kappa | 2 annotators, categorical | Exactly 2 | Nominal/ordinal |
301
+ | Fleiss' kappa | 3+ annotators, categorical | 3+ | Nominal |
302
+ | Pearson/Spearman | Continuous scores | 2 | Interval/ratio |
303
+
304
+ #### Crowdsourcing Platforms
305
+
306
+ | Platform | Best For | Cost | Quality |
307
+ |----------|----------|------|---------|
308
+ | **Prolific** | Academic research, higher quality | $8-15/hr | High — academic participant pool |
309
+ | **MTurk** | Large-scale, fast turnaround | $2-10/hr | Variable — use qualifications |
310
+ | **Surge AI** | NLP-specific annotations | Premium | High — trained annotators |
311
+ | **Expert annotators** | Domain-specific (medical, legal) | Highest | Highest — but slow |
312
+
313
+ **Ethics requirements**:
314
+ - Report compensation rate (must be at minimum local minimum wage)
315
+ - Describe annotator demographics if relevant
316
+ - Obtain IRB/ethics approval if required by your institution
317
+ - ACL venues explicitly require compensation documentation
318
+
319
+ #### What to Report in the Paper
320
+
321
+ ```
322
+ Human Evaluation Section Checklist:
323
+ - [ ] Number of annotators
324
+ - [ ] Annotator qualifications / recruitment method
325
+ - [ ] Number of items evaluated
326
+ - [ ] Evaluation dimensions with definitions
327
+ - [ ] Scale used (Likert, pairwise, binary)
328
+ - [ ] Inter-annotator agreement (Krippendorff's alpha or Cohen's kappa)
329
+ - [ ] Compensation rate
330
+ - [ ] Time per annotation item
331
+ - [ ] Whether annotators saw model identities (should be blind)
332
+ - [ ] Randomization of presentation order
333
+ ```
334
+
335
+ ---
336
+
337
+ ## Statistical Analysis
338
+
339
+ ### Required Tests
340
+
341
+ | Test | When to Use | Python |
342
+ |------|------------|--------|
343
+ | McNemar's test | Comparing two methods on same problems | `scipy.stats.binomtest` for small n |
344
+ | Two-proportion z-test | Comparing success rates | Custom or `statsmodels` |
345
+ | Fisher's exact test | Small sample pairwise comparison | `scipy.stats.fisher_exact` |
346
+ | Bootstrapped CI | Confidence intervals for any metric | Custom bootstrap |
347
+ | Cohen's h | Effect size for proportions | Manual calculation |
348
+
349
+ ### Standard Analysis Script
350
+
351
+ ```python
352
+ import numpy as np
353
+ from scipy import stats
354
+ from pathlib import Path
355
+ import json
356
+
357
def load_all_results(results_dir):
    """Load every ``result.json`` under ``results_dir`` into nested dicts.

    Files are expected at ``<experiment>/<task>/<strategy>/result.json``
    relative to ``results_dir``; shallower ``result.json`` files are ignored.

    Args:
        results_dir: Root directory to search recursively.

    Returns:
        ``{experiment: {strategy: {task: parsed_json}}}``.
    """
    results = {}
    for result_file in Path(results_dir).rglob("result.json"):
        parts = result_file.relative_to(results_dir).parts
        # parts ends with the file name itself, so three directory levels
        # mean at least four components. Requiring only three would accept
        # <experiment>/<task>/result.json and record "result.json" as the
        # strategy name.
        if len(parts) >= 4:
            experiment, task, strategy = parts[0], parts[1], parts[2]
            data = json.loads(result_file.read_text(encoding="utf-8"))
            results.setdefault(experiment, {}).setdefault(strategy, {})[task] = data
    return results
367
+
368
def pairwise_mcnemar(method_a_results, method_b_results):
    """McNemar's test for paired binary outcomes.

    Only discordant pairs (one method succeeds, the other fails) carry
    information. With fewer than 25 of them an exact binomial test is used,
    otherwise the continuity-corrected chi-squared approximation.

    Args:
        method_a_results: Iterable of truthy/falsy outcomes for method A.
        method_b_results: Outcomes for method B on the same problems, in the
            same order.

    Returns:
        Dict with ``a_wins``, ``b_wins``, ``n_discordant``, ``p_value``
        (float) and ``significant`` (bool, p < 0.05).
    """
    a_win_b_lose = 0
    b_win_a_lose = 0
    # Single pass over the pairs, so one-shot iterators work as inputs too.
    for a, b in zip(method_a_results, method_b_results):
        if a and not b:
            a_win_b_lose += 1
        elif b and not a:
            b_win_a_lose += 1

    n = a_win_b_lose + b_win_a_lose
    if n == 0:
        # No discordant pairs: there is no evidence of a difference, and
        # binomtest rejects n == 0, so report p = 1 directly.
        p_value = 1.0
    elif n < 25:
        # Use exact binomial for small samples
        p_value = stats.binomtest(a_win_b_lose, n, 0.5).pvalue
    else:
        # Chi-squared approximation with continuity correction
        chi2 = (abs(a_win_b_lose - b_win_a_lose) - 1) ** 2 / n
        # sf is the upper tail; it stays accurate where 1 - cdf rounds to 0.
        p_value = stats.chi2.sf(chi2, df=1)

    # Plain float/bool so the result can be written with json.dump.
    p_value = float(p_value)
    return {
        "a_wins": a_win_b_lose,
        "b_wins": b_win_a_lose,
        "n_discordant": n,
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
390
+
391
def bootstrap_ci(data, n_bootstrap=10000, ci=0.95, seed=None):
    """Percentile-bootstrap confidence interval for the mean.

    Args:
        data: One-dimensional sequence of numbers; must not be empty.
        n_bootstrap: Number of resamples (each the same size as ``data``,
            drawn with replacement).
        ci: Confidence level, e.g. 0.95 for a 95% interval.
        seed: Optional seed for reproducible resampling. When None, NumPy's
            global random state is used, as before.

    Returns:
        Dict with ``mean``, ``ci_lower`` and ``ci_upper`` as floats.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("bootstrap_ci requires at least one data point")

    # A dedicated generator makes a run repeatable without touching global
    # state; the unseeded path keeps using np.random so callers that rely on
    # np.random.seed(...) are unaffected.
    rng = np.random if seed is None else np.random.default_rng(seed)
    means = [
        np.mean(rng.choice(values, size=values.size, replace=True))
        for _ in range(n_bootstrap)
    ]
    lower = np.percentile(means, (1 - ci) / 2 * 100)
    upper = np.percentile(means, (1 + ci) / 2 * 100)
    return {
        "mean": float(np.mean(values)),
        "ci_lower": float(lower),
        "ci_upper": float(upper),
    }
400
+
401
def cohens_h(p1, p2):
    """Return Cohen's h, the effect size between proportions p1 and p2.

    Each proportion is arcsine-transformed (2 * arcsin(sqrt(p))) and the
    result is the first transformed value minus the second, so the sign
    shows which proportion is larger.
    """
    phi_first = 2 * np.arcsin(np.sqrt(p1))
    phi_second = 2 * np.arcsin(np.sqrt(p2))
    return phi_first - phi_second
404
+ ```
405
+
406
+ ### Reporting Standards
407
+
408
+ Always include in the paper:
409
+ - **Sample sizes**: n=X problems/tasks
410
+ - **Number of runs**: K independent runs if applicable
411
+ - **Error bars**: Specify standard deviation or standard error
412
+ - **Confidence intervals**: 95% CI for key results
413
+ - **Significance tests**: p-values for key comparisons
414
+ - **Effect sizes**: Cohen's d or h for practical significance
415
+
416
+ ---
417
+
418
+ ## Monitoring (Cron Pattern)
419
+
420
+ ### Cron Prompt Template
421
+
422
+ For each experiment batch, create a monitoring prompt:
423
+
424
+ ```
425
+ Check the status of the [EXPERIMENT_NAME] experiment:
426
+
427
+ 1. Process check: ps aux | grep [PROCESS_PATTERN]
428
+ 2. Log check: tail -30 [LOG_FILE]
429
+ 3. Results check: ls [RESULT_DIR]/eval/ (or appropriate result location)
430
+ 4. If results are available:
431
+ - Read the result JSON files
432
+ - Report metrics in a table (Borda scores, accuracy, etc.)
433
+ - Compute key comparisons between methods
434
+ 5. If all experiments in this batch are complete:
435
+ - git add -A && git commit -m "[COMMIT_MESSAGE]" && git push
436
+ - Report final summary
437
+ 6. Key question: [SPECIFIC ANALYTICAL QUESTION]
438
+
439
+ If nothing has changed since the last check, respond with [SILENT].
440
+ ```
441
+
442
+ ### Monitoring Best Practices
443
+
444
+ 1. **Check processes first** — don't read results if the experiment is still running and results are incomplete
445
+ 2. **Read the log tail** — look for errors, progress indicators, completion messages
446
+ 3. **Count completed vs expected** — "45/150 problems done" is more useful than "some results exist"
447
+ 4. **Report in structured tables** — always include key metrics in a table
448
+ 5. **Answer the key question** — each experiment should have a specific analytical question to answer when done
449
+ 6. **[SILENT] for no-news** — suppress notifications when nothing has changed
450
+ 7. **Commit on completion** — every completed batch gets committed with a descriptive message
451
+
452
+ ### Example Monitoring Report
453
+
454
+ ```
455
+ ## Code Experiments (Haiku 3.5) - COMPLETE
456
+
457
+ | Strategy | Pass Rate (150 problems) | vs Single |
458
+ |----------|------------------------|-----------|
459
+ | single_pass | 38.0% | — |
460
+ | critique_revise | 35.2% | -2.8pp |
461
+ | **autoreason** | **40.0%** | **+2.0pp** |
462
+ | best_of_6 | 31.0% | -7.0pp |
463
+
464
+ Key finding: Autoreason shows +2pp improvement over single pass, while
465
+ best-of-6 collapses due to single-public-test selection issue.
466
+
467
+ Committed: `git commit -m "Add Haiku code results (150 problems, 4 strategies)"`
468
+ Next: Run significance tests on these results.
469
+ ```
470
+
471
+ ---
472
+
473
+ ## Failure Recovery
474
+
475
+ ### Common Failures and Recovery
476
+
477
+ | Failure | Detection | Recovery |
478
+ |---------|-----------|----------|
479
+ | **API credit exhaustion** | 402 errors in logs, incomplete results | Top up credits, re-run (skips completed work automatically) |
480
+ | **Rate limiting** | 429 errors, slow progress | Add retry logic with exponential backoff |
481
+ | **Process crash** | PID gone, log stops mid-problem | Re-run script (resumes from last checkpoint) |
482
+ | **Wrong model ID** | Model not found errors | Fix ID (e.g., `claude-opus-4-6` not `claude-opus-4.6`) |
483
+ | **Parallel slowdown** | Each experiment taking 2x longer | Reduce parallel experiments to 2-3 max |
484
+ | **Security scan blocks** | Commands blocked by security | Use `execute_code` instead of piped `terminal` commands |
485
+ | **Delegation failures** | `delegate_task` returns errors | Fall back to doing work directly |
486
+ | **Timeout on hard problems** | Process stuck, no log progress | Kill, skip problem, note in results |
487
+ | **Dataset path mismatch** | File not found errors | Verify paths before launching |
488
+
489
+ ### Retry Naming Convention
490
+
491
+ When re-running failed experiments, use a suffix to track rounds:
492
+
493
+ ```
494
+ logs/experiment_haiku_0_50.log # Round 1
495
+ logs/experiment_haiku_0_50_r2.log # Round 2 (after credit exhaustion)
496
+ logs/experiment_haiku_0_50_r3.log # Round 3 (after bug fix)
497
+ ```
498
+
499
+ ### Pre-Flight Checklist
500
+
501
+ Before launching any experiment batch:
502
+
503
+ ```
504
+ Pre-Flight:
505
+ - [ ] API credits sufficient for estimated calls
506
+ - [ ] Model IDs correct (test with 1 problem first)
507
+ - [ ] Output directory exists and is writable
508
+ - [ ] Resume logic works (re-run won't overwrite existing results)
509
+ - [ ] Log file path is unique (won't overwrite previous logs)
510
+ - [ ] Dataset/task files are accessible
511
+ - [ ] Config matches intended experiment
512
+ ```
513
+
514
+ ---
515
+
516
+ ## Task/Benchmark Design
517
+
518
+ ### Open-Ended Tasks (Subjective Evaluation)
519
+
520
+ Design tasks that have clear objectives but subjective quality:
521
+
522
+ ```markdown
523
+ # Task: [Title]
524
+
525
+ ## Context
526
+ [Specific scenario with concrete details: company size, constraints, timeline]
527
+
528
+ ## Deliverable
529
+ [Exact format and structure required]
530
+
531
+ ## Requirements
532
+ - [Specific, measurable requirements]
533
+ - [Not vague — "be comprehensive" is bad, "include exactly 6 sections" is good]
534
+ ```
535
+
536
+ ### Constrained Tasks (for Testing Scope Effects)
537
+
538
+ Constrained tasks test whether methods respect scope boundaries. Design with:
539
+
540
+ - **Fixed facts**: "Use only these N data points, add nothing else"
541
+ - **Fixed deliverable**: Specific format (pitch, postmortem, memo — not "improve this")
542
+ - **Fixed structure**: "These sections in this order, do not add/remove"
543
+ - **Fixed change items**: "Address exactly these N points, nothing else"
544
+
545
+ **Do NOT use word count as a scope constraint.** Word limits cause false convergence — outputs get rejected for length, not quality. Constrain scope (what to include) not length.
546
+
547
+ ### Example: Good vs Bad Constraints
548
+
549
+ | Bad Constraint | Why | Good Constraint |
550
+ |---------------|-----|-----------------|
551
+ | "Max 500 words" | Judges reject for length | "Exactly 4 sections, each with 3 numbered items" |
552
+ | "Be concise" | Too vague | "Each prohibition must reference a specific base fact" |
553
+ | "Improve this" | Unbounded scope | "Write an incident postmortem with this exact structure" |
554
+ | "Make it better" | No clear criterion | "Address exactly these 3 reviewer concerns" |
555
+
556
+ ---
557
+
558
+ ## Visualization Best Practices
559
+
560
+ ### Setup: SciencePlots + matplotlib
561
+
562
+ Install SciencePlots for publication-ready defaults:
563
+
564
+ ```bash
565
+ pip install SciencePlots matplotlib numpy
566
+ ```
567
+
568
+ **Option A: SciencePlots styles** (recommended — handles most defaults automatically):
569
+
570
+ ```python
571
+ import matplotlib.pyplot as plt
572
+ import scienceplots # registers the styles
573
+
574
+ # Pick a style:
575
+ # 'science' — clean, serif fonts, suitable for most venues
576
+ # ['science', 'ieee'] — IEEE-style (good for two-column papers)
577
+ # ['science', 'nature'] — Nature-style
578
+ # Add 'no-latex' if LaTeX is not installed on the machine generating plots
579
+
580
+ with plt.style.context(['science', 'no-latex']):
581
+ fig, ax = plt.subplots(figsize=(3.5, 2.5)) # single-column width
582
+ # ... plot ...
583
+ fig.savefig('paper/fig_results.pdf', bbox_inches='tight')
584
+ ```
585
+
586
+ **Option B: Manual rcParams** (when you need full control):
587
+
588
+ ```python
589
+ import matplotlib.pyplot as plt
590
+
591
+ plt.rcParams.update({
592
+ 'font.size': 10,
593
+ 'font.family': 'serif',
594
+ 'axes.labelsize': 11,
595
+ 'axes.titlesize': 11,
596
+ 'xtick.labelsize': 9,
597
+ 'ytick.labelsize': 9,
598
+ 'legend.fontsize': 9,
599
+ 'figure.figsize': (3.5, 2.5), # single-column default
600
+ 'figure.dpi': 300,
601
+ 'savefig.dpi': 300,
602
+ 'savefig.bbox': 'tight',
603
+ 'savefig.pad_inches': 0.05,
604
+ 'axes.linewidth': 0.8,
605
+ 'lines.linewidth': 1.5,
606
+ 'lines.markersize': 5,
607
+ 'axes.grid': True,
608
+ 'grid.alpha': 0.3,
609
+ 'grid.linewidth': 0.5,
610
+ })
611
+ ```
612
+
613
+ ### Standard Figure Sizes (Two-Column Format)
614
+
615
+ | Use Case | figsize | Notes |
616
+ |----------|---------|-------|
617
+ | Single column | `(3.5, 2.5)` | Fits in one column of two-column layout |
618
+ | Double column | `(7.0, 3.0)` | Spans full page width |
619
+ | Square (heatmap, confusion matrix) | `(3.5, 3.5)` | Single column |
620
+ | Tall single (many rows) | `(3.5, 5.0)` | Use sparingly |
621
+
622
+ ### Colorblind-Safe Palette (Okabe-Ito)
623
+
624
+ Use this palette for all paper figures. It is distinguishable by people with all common forms of color vision deficiency:
625
+
626
+ ```python
627
+ COLORS = {
628
+ 'blue': '#0072B2',
629
+ 'orange': '#E69F00',
630
+ 'green': '#009E73',
631
+ 'red': '#D55E00',
632
+ 'purple': '#CC79A7',
633
+ 'cyan': '#56B4E9',
634
+ 'yellow': '#F0E442',
635
+ 'black': '#000000',
636
+ }
637
+
638
+ # As a list for cycling:
639
+ COLOR_CYCLE = ['#0072B2', '#D55E00', '#009E73', '#E69F00', '#CC79A7', '#56B4E9']
640
+ ```
641
+
642
+ Also differentiate lines by **marker and linestyle**, not just color:
643
+ ```python
644
+ STYLES = [
645
+ {'color': '#0072B2', 'marker': 'o', 'linestyle': '-'},
646
+ {'color': '#D55E00', 'marker': 's', 'linestyle': '--'},
647
+ {'color': '#009E73', 'marker': '^', 'linestyle': '-.'},
648
+ {'color': '#E69F00', 'marker': 'D', 'linestyle': ':'},
649
+ ]
650
+ ```
651
+
652
+ ### Complete Example: Method Comparison Bar Chart
653
+
654
+ ```python
655
+ import matplotlib.pyplot as plt
656
+ import numpy as np
657
+
658
+ try:
659
+ import scienceplots
660
+ style = ['science', 'no-latex']
661
+ except ImportError:
662
+ style = 'default'
663
+
664
+ with plt.style.context(style):
665
+ methods = ['Single Pass', 'Critique+Revise', 'Best-of-N', 'Ours']
666
+ scores = [73.2, 74.1, 68.5, 77.0]
667
+ errors = [2.1, 1.8, 3.2, 1.5]
668
+ colors = ['#56B4E9', '#E69F00', '#CC79A7', '#0072B2']
669
+
670
+ fig, ax = plt.subplots(figsize=(3.5, 2.5))
671
+ bars = ax.bar(methods, scores, yerr=errors, capsize=3,
672
+ color=colors, edgecolor='black', linewidth=0.5)
673
+
674
+ # Highlight "Ours"
675
+ bars[-1].set_edgecolor('#0072B2')
676
+ bars[-1].set_linewidth(1.5)
677
+
678
+ ax.set_ylabel('Pass Rate (%)')
679
+ ax.set_ylim(60, 85)
680
+ ax.spines['top'].set_visible(False)
681
+ ax.spines['right'].set_visible(False)
682
+
683
+ fig.savefig('paper/fig_comparison.pdf', bbox_inches='tight')
684
+ ```
685
+
686
+ ### Complete Example: Convergence/Trajectory Line Chart
687
+
688
+ ```python
689
+ with plt.style.context(style):
690
+ fig, ax = plt.subplots(figsize=(3.5, 2.5))
691
+
692
+ passes = np.arange(1, 16)
693
+ ours = [65, 72, 78, 82, 85, 87, 88, 89, 89.5, 90, 90, 90, 90, 90, 90]
694
+ baseline = [65, 68, 70, 71, 69, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58]
695
+
696
+ ax.plot(passes, ours, **STYLES[0], label='Ours', markersize=4)
697
+ ax.plot(passes, baseline, **STYLES[1], label='Critique+Revise', markersize=4)
698
+
699
+ # Mark convergence point
700
+ ax.axvline(x=10, color='gray', linestyle=':', alpha=0.5, linewidth=0.8)
701
+ ax.annotate('Converged', xy=(10, 90), fontsize=8, ha='center',
702
+ xytext=(10, 93), arrowprops=dict(arrowstyle='->', color='gray'))
703
+
704
+ ax.set_xlabel('Iteration')
705
+ ax.set_ylabel('Quality Score')
706
+ ax.legend(loc='lower right')
707
+ ax.spines['top'].set_visible(False)
708
+ ax.spines['right'].set_visible(False)
709
+
710
+ fig.savefig('paper/fig_trajectory.pdf', bbox_inches='tight')
711
+ ```
712
+
713
+ ### Output Rules
714
+
715
+ - **Always save as PDF**: `fig.savefig('fig.pdf')` — vector graphics, sharp at any zoom
716
+ - **Never save as PNG** for paper figures — raster PNGs look blurry when printed/zoomed
717
+ - **Exception**: Screenshots, photographs, or pixel-art visualizations → PNG at 600 DPI
718
+ - **Verify grayscale**: Print to grayscale PDF and check all information is still visible
719
+
720
+ ### Chart Types for Common Comparisons
721
+
722
+ | Comparison Type | Chart | Notes |
723
+ |----------------|-------|-------|
724
+ | Method vs method | Grouped bar chart | Include error bars |
725
+ | Across model sizes | Line chart with CI bands | Log scale for model size axis |
726
+ | Ablation study | Stacked/grouped bar | Highlight removed component |
727
+ | Trajectory/convergence | Line chart over iterations | Show winner per iteration |
728
+ | Per-task breakdown | Heatmap or grouped bar | Show variance across tasks |