claude-turing 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +4 -2
- package/commands/lit.md +47 -0
- package/commands/paper.md +44 -0
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +2 -0
- package/templates/scripts/__pycache__/draft_paper_sections.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/literature_search.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/draft_paper_sections.py +498 -0
- package/templates/scripts/literature_search.py +421 -0
- package/templates/scripts/scaffold.py +4 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "2.
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "2.1.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 27 commands, 2 specialized agents, literature integration + paper section drafting, production model export (6 formats, equivalence, latency), performance profiling, smart Pareto-based checkpoint management, experiment intelligence (error analysis, ablation, Pareto frontiers), statistical rigor (seed studies, reproducibility), tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -330,6 +330,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
330
330
|
| `/turing:frontier [--metrics]` | Pareto frontier — multi-objective tradeoff visualization |
|
|
331
331
|
| `/turing:profile [exp-id]` | Computational profiling — timing, memory, throughput, bottleneck detection |
|
|
332
332
|
| `/turing:checkpoint <action>` | Smart checkpoint management — list, prune (Pareto), average, resume, stats |
|
|
333
|
+
| `/turing:lit <query>` | Literature search — papers, SOTA baselines, related work |
|
|
334
|
+
| `/turing:paper [--sections] [--format]` | Draft paper sections from experiment logs (setup, results, ablation, hyperparams) |
|
|
333
335
|
| `/turing:export [--format]` | Export model to production format with equivalence check + latency benchmark |
|
|
334
336
|
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
335
337
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
@@ -520,11 +522,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
520
522
|
|
|
521
523
|
## Architecture of Turing Itself
|
|
522
524
|
|
|
523
|
-
|
|
525
|
+
27 commands, 2 agents, 8 config files, 46 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, 664 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
524
526
|
|
|
525
527
|
```
|
|
526
528
|
turing/
|
|
527
|
-
├── commands/
|
|
529
|
+
├── commands/ 26 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow)
|
|
528
530
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
529
531
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
530
532
|
├── templates/ Scaffolded into user projects by /turing:init
|
package/commands/lit.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: lit
|
|
3
|
+
description: Literature search scoped to the current experiment domain — find papers, SOTA baselines, and related work without leaving the terminal.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<query> | --baseline | --related <exp-id>"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob, WebSearch
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Search the literature for papers, baselines, and related work.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- **Free query:** `"gradient boosting for tabular data"` — searches Semantic Scholar
|
|
20
|
+
- **Baseline:** `--baseline` — finds SOTA results for the current task, compares against your best
|
|
21
|
+
- **Related:** `--related exp-042` — finds papers using similar methods to a specific experiment
|
|
22
|
+
- `--auto-queue` — auto-queues hypotheses from literature with `source: "literature"`
|
|
23
|
+
- `--limit 10` — max number of results
|
|
24
|
+
|
|
25
|
+
3. **Run literature search:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/literature_search.py $ARGUMENTS
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **Report results:**
|
|
31
|
+
- **Papers:** title, authors, year, venue, citations, abstract snippet, URL
|
|
32
|
+
- **Baseline mode:** SOTA comparison with gap analysis against current best
|
|
33
|
+
- **Related mode:** methodological differences worth investigating
|
|
34
|
+
- **Hypotheses:** if `--auto-queue`, shows queued experiments from findings
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** results written to `experiments/literature/query-YYYY-MM-DD-HHMMSS.md`
|
|
37
|
+
|
|
38
|
+
6. **If API unavailable:** reports error and suggests manual search.
|
|
39
|
+
|
|
40
|
+
## Examples
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
/turing:lit "gradient boosting missing values" # Free query
|
|
44
|
+
/turing:lit --baseline # SOTA comparison
|
|
45
|
+
/turing:lit --related exp-042 # Related work
|
|
46
|
+
/turing:lit --auto-queue "ensemble methods" # Queue hypotheses
|
|
47
|
+
```
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: paper
|
|
3
|
+
description: Draft mechanical paper sections (setup, results, ablation, hyperparameters) from experiment logs. LaTeX and markdown output.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--sections setup,results,ablation] [--format latex|markdown]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Draft paper sections directly from experiment data.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--sections setup,results,ablation,hyperparameters` — which sections to draft (default: all)
|
|
20
|
+
- `--format latex|markdown` — output format (default: latex)
|
|
21
|
+
|
|
22
|
+
3. **Run paper drafting:**
|
|
23
|
+
```bash
|
|
24
|
+
python scripts/draft_paper_sections.py $ARGUMENTS
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
4. **Report results:**
|
|
28
|
+
- **setup:** Experimental setup prose (dataset, metrics, split, seed methodology)
|
|
29
|
+
- **results:** Comparison table with all model types, best bolded, seed study stats
|
|
30
|
+
- **ablation:** Ablation table from `/turing:ablate` results
|
|
31
|
+
- **hyperparameters:** Appendix-style parameter table per model
|
|
32
|
+
|
|
33
|
+
5. **Output:** Each section saved to `paper/sections/` as `.tex` or `.md`
|
|
34
|
+
|
|
35
|
+
6. **Numbers are pulled directly from experiment logs** — no manual transcription needed.
|
|
36
|
+
|
|
37
|
+
## Examples
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
/turing:paper # All sections, LaTeX
|
|
41
|
+
/turing:paper --format markdown # All sections, markdown
|
|
42
|
+
/turing:paper --sections setup,results # Just setup + results
|
|
43
|
+
/turing:paper --sections ablation --format latex # Just ablation table
|
|
44
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -31,6 +31,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
31
31
|
| "diagnose", "error analysis", "failure modes", "where does it fail", "confusion matrix" | `/turing:diagnose` | Analyze |
|
|
32
32
|
| "ablate", "ablation", "remove component", "which features matter", "component impact" | `/turing:ablate` | Analyze |
|
|
33
33
|
| "frontier", "pareto", "tradeoff", "tradeoffs", "multi-objective", "which model is best" | `/turing:frontier` | Analyze |
|
|
34
|
+
| "lit", "literature", "papers", "SOTA", "baseline", "related work", "citations" | `/turing:lit` | Research |
|
|
35
|
+
| "paper", "draft paper", "write paper", "results table", "latex", "experimental setup" | `/turing:paper` | Document |
|
|
34
36
|
| "export", "deploy", "production", "onnx", "torchscript", "tflite", "ship model" | `/turing:export` | Deploy |
|
|
35
37
|
| "profile", "profiling", "bottleneck", "slow training", "why is it slow", "timing" | `/turing:profile` | Check |
|
|
36
38
|
| "checkpoint", "checkpoints", "prune checkpoints", "disk space", "resume training" | `/turing:checkpoint` | Check |
|
|
@@ -61,6 +63,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
61
63
|
| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | (inline) |
|
|
62
64
|
| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | (inline) |
|
|
63
65
|
| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | (inline) |
|
|
66
|
+
| `/turing:lit <query>` | Literature search: papers, SOTA baselines, related work | (inline, uses WebSearch) |
|
|
67
|
+
| `/turing:paper [--sections] [--format]` | Draft paper sections from experiment logs (setup, results, ablation, hyperparams) | (inline) |
|
|
64
68
|
| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark | (inline) |
|
|
65
69
|
| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
|
|
66
70
|
| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -24,6 +24,7 @@ const SUB_COMMANDS = [
|
|
|
24
24
|
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
25
|
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
26
|
"diagnose", "ablate", "frontier", "profile", "checkpoint", "export",
|
|
27
|
+
"lit", "paper",
|
|
27
28
|
];
|
|
28
29
|
|
|
29
30
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Paper section drafting from experiment logs.
|
|
3
|
+
|
|
4
|
+
Drafts the mechanical sections of an ML paper directly from experiment
|
|
5
|
+
data: experimental setup, results tables, ablation tables, and
|
|
6
|
+
hyperparameter appendices. Eliminates transcription errors.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/draft_paper_sections.py # All sections
|
|
10
|
+
python scripts/draft_paper_sections.py --sections setup,results # Specific sections
|
|
11
|
+
python scripts/draft_paper_sections.py --format latex # LaTeX output
|
|
12
|
+
python scripts/draft_paper_sections.py --format markdown # Markdown output
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import sys
|
|
20
|
+
from datetime import datetime, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
from scripts.turing_io import load_config, load_experiments
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
VALID_SECTIONS = ["setup", "results", "ablation", "hyperparameters"]
|
|
29
|
+
DEFAULT_FORMAT = "latex"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_seed_studies_for_paper(seed_dir: str = "experiments/seed_studies") -> dict[str, dict]:
|
|
33
|
+
"""Load all seed studies indexed by experiment ID."""
|
|
34
|
+
path = Path(seed_dir)
|
|
35
|
+
studies = {}
|
|
36
|
+
if not path.exists():
|
|
37
|
+
return studies
|
|
38
|
+
for f in path.glob("*-seeds.yaml"):
|
|
39
|
+
try:
|
|
40
|
+
with open(f) as fh:
|
|
41
|
+
study = yaml.safe_load(fh)
|
|
42
|
+
if study and isinstance(study, dict) and "experiment_id" in study:
|
|
43
|
+
studies[study["experiment_id"]] = study
|
|
44
|
+
except (yaml.YAMLError, OSError):
|
|
45
|
+
continue
|
|
46
|
+
return studies
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_ablation_studies(ablation_dir: str = "experiments/ablations") -> dict[str, dict]:
|
|
50
|
+
"""Load all ablation studies indexed by experiment ID."""
|
|
51
|
+
path = Path(ablation_dir)
|
|
52
|
+
studies = {}
|
|
53
|
+
if not path.exists():
|
|
54
|
+
return studies
|
|
55
|
+
for f in path.glob("*-ablation.yaml"):
|
|
56
|
+
try:
|
|
57
|
+
with open(f) as fh:
|
|
58
|
+
study = yaml.safe_load(fh)
|
|
59
|
+
if study and isinstance(study, dict) and "experiment_id" in study:
|
|
60
|
+
studies[study["experiment_id"]] = study
|
|
61
|
+
except (yaml.YAMLError, OSError):
|
|
62
|
+
continue
|
|
63
|
+
return studies
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_top_experiments(
|
|
67
|
+
experiments: list[dict],
|
|
68
|
+
metric: str,
|
|
69
|
+
lower_is_better: bool,
|
|
70
|
+
top_k: int = 10,
|
|
71
|
+
) -> list[dict]:
|
|
72
|
+
"""Get top-K kept experiments by primary metric."""
|
|
73
|
+
kept = [e for e in experiments if e.get("status") == "kept" and e.get("metrics", {}).get(metric) is not None]
|
|
74
|
+
kept.sort(key=lambda e: e["metrics"][metric], reverse=not lower_is_better)
|
|
75
|
+
return kept[:top_k]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def group_by_model_type(experiments: list[dict]) -> dict[str, list[dict]]:
|
|
79
|
+
"""Group experiments by model type, keeping best per type."""
|
|
80
|
+
groups: dict[str, list[dict]] = {}
|
|
81
|
+
for exp in experiments:
|
|
82
|
+
mt = exp.get("config", {}).get("model_type", "unknown")
|
|
83
|
+
groups.setdefault(mt, []).append(exp)
|
|
84
|
+
return groups
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def draft_setup_section(
|
|
88
|
+
config: dict,
|
|
89
|
+
experiments: list[dict],
|
|
90
|
+
seed_studies: dict[str, dict],
|
|
91
|
+
output_format: str = "latex",
|
|
92
|
+
) -> str:
|
|
93
|
+
"""Draft the experimental setup section."""
|
|
94
|
+
eval_cfg = config.get("evaluation", {})
|
|
95
|
+
data_cfg = config.get("data", {})
|
|
96
|
+
primary_metric = eval_cfg.get("primary_metric", "accuracy")
|
|
97
|
+
metrics = eval_cfg.get("metrics", [primary_metric])
|
|
98
|
+
lower_is_better = eval_cfg.get("lower_is_better", False)
|
|
99
|
+
|
|
100
|
+
task_desc = config.get("task_description", "the classification task")
|
|
101
|
+
data_source = data_cfg.get("source", "the provided dataset")
|
|
102
|
+
split_ratios = data_cfg.get("split_ratios", {})
|
|
103
|
+
random_state = data_cfg.get("random_state", 42)
|
|
104
|
+
|
|
105
|
+
# Determine seed study info
|
|
106
|
+
n_seeds = 0
|
|
107
|
+
for study in seed_studies.values():
|
|
108
|
+
n_seeds = max(n_seeds, len(study.get("seeds_run", [])))
|
|
109
|
+
|
|
110
|
+
# Build prose
|
|
111
|
+
split_text = ""
|
|
112
|
+
if split_ratios:
|
|
113
|
+
parts = [f"{int(v*100)}\\%" if output_format == "latex" else f"{int(v*100)}%" for v in split_ratios.values()]
|
|
114
|
+
split_names = list(split_ratios.keys())
|
|
115
|
+
split_text = "/".join(parts) + " " + "/".join(split_names) + " split"
|
|
116
|
+
|
|
117
|
+
direction = "lower" if lower_is_better else "higher"
|
|
118
|
+
metric_list = ", ".join(metrics)
|
|
119
|
+
|
|
120
|
+
if output_format == "latex":
|
|
121
|
+
lines = [
|
|
122
|
+
r"\subsection{Experimental Setup}",
|
|
123
|
+
"",
|
|
124
|
+
f"We evaluate on {data_source} using {metric_list} as evaluation metrics "
|
|
125
|
+
f"({direction} is better for {primary_metric}).",
|
|
126
|
+
]
|
|
127
|
+
if split_text:
|
|
128
|
+
lines.append(f"Data is partitioned using a {split_text} with random state {random_state}.")
|
|
129
|
+
if n_seeds > 0:
|
|
130
|
+
lines.append(
|
|
131
|
+
f"Results are reported as mean $\\pm$ standard deviation over {n_seeds} random seeds "
|
|
132
|
+
f"to account for seed sensitivity."
|
|
133
|
+
)
|
|
134
|
+
lines.append(f"All experiments use {task_desc} as the target task.")
|
|
135
|
+
else:
|
|
136
|
+
lines = [
|
|
137
|
+
"## Experimental Setup",
|
|
138
|
+
"",
|
|
139
|
+
f"We evaluate on {data_source} using {metric_list} as evaluation metrics "
|
|
140
|
+
f"({direction} is better for {primary_metric}).",
|
|
141
|
+
]
|
|
142
|
+
if split_text:
|
|
143
|
+
lines.append(f"Data is partitioned using a {split_text} with random state {random_state}.")
|
|
144
|
+
if n_seeds > 0:
|
|
145
|
+
lines.append(
|
|
146
|
+
f"Results are reported as mean +/- standard deviation over {n_seeds} random seeds "
|
|
147
|
+
f"to account for seed sensitivity."
|
|
148
|
+
)
|
|
149
|
+
lines.append(f"All experiments use {task_desc} as the target task.")
|
|
150
|
+
|
|
151
|
+
return "\n".join(lines)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def draft_results_table(
|
|
155
|
+
experiments: list[dict],
|
|
156
|
+
metrics: list[str],
|
|
157
|
+
primary_metric: str,
|
|
158
|
+
lower_is_better: bool,
|
|
159
|
+
seed_studies: dict[str, dict],
|
|
160
|
+
output_format: str = "latex",
|
|
161
|
+
dataset_name: str = "the dataset",
|
|
162
|
+
) -> str:
|
|
163
|
+
"""Draft the results comparison table."""
|
|
164
|
+
# Group by model type, take best per type
|
|
165
|
+
groups = group_by_model_type(experiments)
|
|
166
|
+
rows = []
|
|
167
|
+
for mt, exps in groups.items():
|
|
168
|
+
# Best experiment per model type
|
|
169
|
+
best = exps[0] # Already sorted by caller
|
|
170
|
+
row = {"model_type": mt, "experiment_id": best.get("experiment_id", "?")}
|
|
171
|
+
for m in metrics:
|
|
172
|
+
val = best.get("metrics", {}).get(m)
|
|
173
|
+
seed = seed_studies.get(best.get("experiment_id", ""))
|
|
174
|
+
if seed and seed.get("metric") == m:
|
|
175
|
+
row[m] = {"value": val, "mean": seed.get("mean"), "std": seed.get("std")}
|
|
176
|
+
else:
|
|
177
|
+
row[m] = {"value": val}
|
|
178
|
+
rows.append(row)
|
|
179
|
+
|
|
180
|
+
# Find best value per metric
|
|
181
|
+
best_per_metric = {}
|
|
182
|
+
for m in metrics:
|
|
183
|
+
values = [(r["model_type"], r[m].get("mean") or r[m].get("value")) for r in rows if r[m].get("value") is not None]
|
|
184
|
+
if values:
|
|
185
|
+
if lower_is_better:
|
|
186
|
+
best_per_metric[m] = min(values, key=lambda x: x[1] if x[1] is not None else float("inf"))[0]
|
|
187
|
+
else:
|
|
188
|
+
best_per_metric[m] = max(values, key=lambda x: x[1] if x[1] is not None else float("-inf"))[0]
|
|
189
|
+
|
|
190
|
+
if output_format == "latex":
|
|
191
|
+
return _format_results_latex(rows, metrics, best_per_metric, dataset_name)
|
|
192
|
+
else:
|
|
193
|
+
return _format_results_markdown(rows, metrics, best_per_metric, dataset_name)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _format_results_latex(rows: list[dict], metrics: list[str], best_per: dict, dataset: str) -> str:
|
|
197
|
+
"""Format results as LaTeX table."""
|
|
198
|
+
n_cols = len(metrics)
|
|
199
|
+
col_spec = "l" + "c" * n_cols
|
|
200
|
+
metric_headers = " & ".join(m.replace("_", r"\_") for m in metrics)
|
|
201
|
+
|
|
202
|
+
lines = [
|
|
203
|
+
r"\begin{table}[h]",
|
|
204
|
+
r"\centering",
|
|
205
|
+
f"\\caption{{Comparison of model architectures on {dataset}.}}",
|
|
206
|
+
r"\label{tab:results}",
|
|
207
|
+
f"\\begin{{tabular}}{{{col_spec}}}",
|
|
208
|
+
r"\toprule",
|
|
209
|
+
f"Model & {metric_headers} \\\\",
|
|
210
|
+
r"\midrule",
|
|
211
|
+
]
|
|
212
|
+
|
|
213
|
+
for row in rows:
|
|
214
|
+
mt = row["model_type"].replace("_", r"\_")
|
|
215
|
+
cells = [mt]
|
|
216
|
+
for m in metrics:
|
|
217
|
+
data = row[m]
|
|
218
|
+
val = data.get("mean") or data.get("value")
|
|
219
|
+
std = data.get("std")
|
|
220
|
+
if val is None:
|
|
221
|
+
cells.append("---")
|
|
222
|
+
elif std:
|
|
223
|
+
cell = f"{val:.3f} $\\pm$ {std:.3f}"
|
|
224
|
+
if best_per.get(m) == row["model_type"]:
|
|
225
|
+
cell = f"\\textbf{{{cell}}}"
|
|
226
|
+
cells.append(cell)
|
|
227
|
+
else:
|
|
228
|
+
cell = f"{val:.4f}"
|
|
229
|
+
if best_per.get(m) == row["model_type"]:
|
|
230
|
+
cell = f"\\textbf{{{cell}}}"
|
|
231
|
+
cells.append(cell)
|
|
232
|
+
lines.append(" & ".join(cells) + r" \\")
|
|
233
|
+
|
|
234
|
+
lines.extend([
|
|
235
|
+
r"\bottomrule",
|
|
236
|
+
r"\end{tabular}",
|
|
237
|
+
r"\end{table}",
|
|
238
|
+
])
|
|
239
|
+
return "\n".join(lines)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _format_results_markdown(rows: list[dict], metrics: list[str], best_per: dict, dataset: str) -> str:
|
|
243
|
+
"""Format results as markdown table."""
|
|
244
|
+
header = f"| Model |"
|
|
245
|
+
sep = "|-------|"
|
|
246
|
+
for m in metrics:
|
|
247
|
+
header += f" {m} |"
|
|
248
|
+
sep += f"{'---' * max(len(m) // 3, 1)}--|"
|
|
249
|
+
|
|
250
|
+
lines = [
|
|
251
|
+
f"## Results on {dataset}",
|
|
252
|
+
"",
|
|
253
|
+
header,
|
|
254
|
+
sep,
|
|
255
|
+
]
|
|
256
|
+
|
|
257
|
+
for row in rows:
|
|
258
|
+
line = f"| {row['model_type']} |"
|
|
259
|
+
for m in metrics:
|
|
260
|
+
data = row[m]
|
|
261
|
+
val = data.get("mean") or data.get("value")
|
|
262
|
+
std = data.get("std")
|
|
263
|
+
if val is None:
|
|
264
|
+
line += " --- |"
|
|
265
|
+
elif std:
|
|
266
|
+
cell = f"{val:.3f} +/- {std:.3f}"
|
|
267
|
+
if best_per.get(m) == row["model_type"]:
|
|
268
|
+
cell = f"**{cell}**"
|
|
269
|
+
line += f" {cell} |"
|
|
270
|
+
else:
|
|
271
|
+
cell = f"{val:.4f}"
|
|
272
|
+
if best_per.get(m) == row["model_type"]:
|
|
273
|
+
cell = f"**{cell}**"
|
|
274
|
+
line += f" {cell} |"
|
|
275
|
+
lines.append(line)
|
|
276
|
+
|
|
277
|
+
return "\n".join(lines)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def draft_ablation_table(
|
|
281
|
+
ablation_studies: dict[str, dict],
|
|
282
|
+
output_format: str = "latex",
|
|
283
|
+
) -> str:
|
|
284
|
+
"""Draft ablation table from ablation study results."""
|
|
285
|
+
if not ablation_studies:
|
|
286
|
+
return "No ablation studies available. Run `/turing:ablate` first."
|
|
287
|
+
|
|
288
|
+
# Use the most recent ablation study
|
|
289
|
+
study = list(ablation_studies.values())[-1]
|
|
290
|
+
metric = study.get("metric", "accuracy")
|
|
291
|
+
full_metric = study.get("full_model_metric", 0)
|
|
292
|
+
results = study.get("results", [])
|
|
293
|
+
|
|
294
|
+
if not results:
|
|
295
|
+
return "Ablation study has no results."
|
|
296
|
+
|
|
297
|
+
if output_format == "latex":
|
|
298
|
+
metric_escaped = metric.replace("_", r"\_")
|
|
299
|
+
lines = [
|
|
300
|
+
r"\begin{table}[h]",
|
|
301
|
+
r"\centering",
|
|
302
|
+
f"\\caption{{Ablation study results ({metric_escaped}).}}",
|
|
303
|
+
r"\label{tab:ablation}",
|
|
304
|
+
r"\begin{tabular}{lcc}",
|
|
305
|
+
r"\toprule",
|
|
306
|
+
f"Configuration & {metric_escaped} & $\\Delta$ from Full \\\\",
|
|
307
|
+
r"\midrule",
|
|
308
|
+
f"Full model & {full_metric:.4f} & --- \\\\",
|
|
309
|
+
]
|
|
310
|
+
for r in results:
|
|
311
|
+
if r.get("status") == "failed":
|
|
312
|
+
continue
|
|
313
|
+
config = r.get("configuration", "?").replace("_", r"\_")
|
|
314
|
+
val = r.get("metric_value", 0)
|
|
315
|
+
delta = r.get("delta", 0)
|
|
316
|
+
delta_str = f"{delta:+.4f}" if delta is not None else "---"
|
|
317
|
+
lines.append(f"{config} & {val:.4f} & {delta_str} \\\\")
|
|
318
|
+
lines.extend([r"\bottomrule", r"\end{tabular}", r"\end{table}"])
|
|
319
|
+
return "\n".join(lines)
|
|
320
|
+
else:
|
|
321
|
+
lines = [
|
|
322
|
+
f"## Ablation Study ({metric})",
|
|
323
|
+
"",
|
|
324
|
+
f"| Configuration | {metric} | Delta from Full |",
|
|
325
|
+
f"|---------------|{'---' * max(len(metric) // 3, 1)}--|-----------------|",
|
|
326
|
+
f"| Full model | {full_metric:.4f} | --- |",
|
|
327
|
+
]
|
|
328
|
+
for r in results:
|
|
329
|
+
if r.get("status") == "failed":
|
|
330
|
+
continue
|
|
331
|
+
config = r.get("configuration", "?")
|
|
332
|
+
val = r.get("metric_value", 0)
|
|
333
|
+
delta = r.get("delta", 0)
|
|
334
|
+
delta_str = f"{delta:+.4f}" if delta is not None else "---"
|
|
335
|
+
lines.append(f"| {config} | {val:.4f} | {delta_str} |")
|
|
336
|
+
return "\n".join(lines)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def draft_hyperparameter_table(
|
|
340
|
+
experiments: list[dict],
|
|
341
|
+
output_format: str = "latex",
|
|
342
|
+
) -> str:
|
|
343
|
+
"""Draft hyperparameter appendix table."""
|
|
344
|
+
groups = group_by_model_type(experiments)
|
|
345
|
+
|
|
346
|
+
if output_format == "latex":
|
|
347
|
+
lines = [
|
|
348
|
+
r"\begin{table}[h]",
|
|
349
|
+
r"\centering",
|
|
350
|
+
r"\caption{Hyperparameters for reported models.}",
|
|
351
|
+
r"\label{tab:hyperparams}",
|
|
352
|
+
r"\begin{tabular}{llr}",
|
|
353
|
+
r"\toprule",
|
|
354
|
+
r"Model & Parameter & Value \\",
|
|
355
|
+
r"\midrule",
|
|
356
|
+
]
|
|
357
|
+
for mt, exps in groups.items():
|
|
358
|
+
best = exps[0]
|
|
359
|
+
hyperparams = best.get("config", {}).get("hyperparams", {})
|
|
360
|
+
first = True
|
|
361
|
+
for param, value in sorted(hyperparams.items()):
|
|
362
|
+
model_col = mt.replace("_", r"\_") if first else ""
|
|
363
|
+
param_escaped = param.replace("_", r"\_")
|
|
364
|
+
lines.append(f"{model_col} & {param_escaped} & {value} \\\\")
|
|
365
|
+
first = False
|
|
366
|
+
lines.append(r"\midrule")
|
|
367
|
+
if lines[-1] == r"\midrule":
|
|
368
|
+
lines.pop()
|
|
369
|
+
lines.extend([r"\bottomrule", r"\end{tabular}", r"\end{table}"])
|
|
370
|
+
return "\n".join(lines)
|
|
371
|
+
else:
|
|
372
|
+
lines = [
|
|
373
|
+
"## Hyperparameters",
|
|
374
|
+
"",
|
|
375
|
+
"| Model | Parameter | Value |",
|
|
376
|
+
"|-------|-----------|-------|",
|
|
377
|
+
]
|
|
378
|
+
for mt, exps in groups.items():
|
|
379
|
+
best = exps[0]
|
|
380
|
+
hyperparams = best.get("config", {}).get("hyperparams", {})
|
|
381
|
+
for param, value in sorted(hyperparams.items()):
|
|
382
|
+
lines.append(f"| {mt} | {param} | {value} |")
|
|
383
|
+
return "\n".join(lines)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def save_paper_sections(sections: dict[str, str], output_dir: str = "paper/sections") -> list[Path]:
|
|
387
|
+
"""Save each section to its own file."""
|
|
388
|
+
out_path = Path(output_dir)
|
|
389
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
|
390
|
+
|
|
391
|
+
saved = []
|
|
392
|
+
for name, content in sections.items():
|
|
393
|
+
ext = ".tex" if r"\begin" in content else ".md"
|
|
394
|
+
filepath = out_path / f"{name}{ext}"
|
|
395
|
+
with open(filepath, "w") as f:
|
|
396
|
+
f.write(content)
|
|
397
|
+
saved.append(filepath)
|
|
398
|
+
|
|
399
|
+
return saved
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def draft_paper(
|
|
403
|
+
sections_str: str | None = None,
|
|
404
|
+
output_format: str = DEFAULT_FORMAT,
|
|
405
|
+
config_path: str = "config.yaml",
|
|
406
|
+
log_path: str = "experiments/log.jsonl",
|
|
407
|
+
) -> dict:
|
|
408
|
+
"""Draft all requested paper sections.
|
|
409
|
+
|
|
410
|
+
Args:
|
|
411
|
+
sections_str: Comma-separated section names (setup,results,ablation,hyperparameters).
|
|
412
|
+
output_format: "latex" or "markdown".
|
|
413
|
+
config_path: Path to config.yaml.
|
|
414
|
+
log_path: Path to experiment log.
|
|
415
|
+
|
|
416
|
+
Returns:
|
|
417
|
+
Dict with section name -> content mappings.
|
|
418
|
+
"""
|
|
419
|
+
if sections_str:
|
|
420
|
+
sections = [s.strip() for s in sections_str.split(",")]
|
|
421
|
+
else:
|
|
422
|
+
sections = VALID_SECTIONS
|
|
423
|
+
|
|
424
|
+
config = load_config(config_path)
|
|
425
|
+
eval_cfg = config.get("evaluation", {})
|
|
426
|
+
primary_metric = eval_cfg.get("primary_metric", "accuracy")
|
|
427
|
+
all_metrics = eval_cfg.get("metrics", [primary_metric])
|
|
428
|
+
lower_is_better = eval_cfg.get("lower_is_better", False)
|
|
429
|
+
|
|
430
|
+
experiments = load_experiments(log_path)
|
|
431
|
+
top_exps = get_top_experiments(experiments, primary_metric, lower_is_better)
|
|
432
|
+
|
|
433
|
+
seed_studies = load_seed_studies_for_paper()
|
|
434
|
+
ablation_studies = load_ablation_studies()
|
|
435
|
+
|
|
436
|
+
dataset_name = config.get("data", {}).get("source", "the dataset")
|
|
437
|
+
|
|
438
|
+
result = {"format": output_format, "sections": {}}
|
|
439
|
+
|
|
440
|
+
for section in sections:
|
|
441
|
+
if section == "setup":
|
|
442
|
+
result["sections"]["setup"] = draft_setup_section(
|
|
443
|
+
config, experiments, seed_studies, output_format,
|
|
444
|
+
)
|
|
445
|
+
elif section == "results":
|
|
446
|
+
result["sections"]["results"] = draft_results_table(
|
|
447
|
+
top_exps, all_metrics, primary_metric, lower_is_better,
|
|
448
|
+
seed_studies, output_format, dataset_name,
|
|
449
|
+
)
|
|
450
|
+
elif section == "ablation":
|
|
451
|
+
result["sections"]["ablation"] = draft_ablation_table(
|
|
452
|
+
ablation_studies, output_format,
|
|
453
|
+
)
|
|
454
|
+
elif section == "hyperparameters":
|
|
455
|
+
result["sections"]["hyperparameters"] = draft_hyperparameter_table(
|
|
456
|
+
top_exps, output_format,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
return result
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def main() -> None:
|
|
463
|
+
"""CLI entry point."""
|
|
464
|
+
parser = argparse.ArgumentParser(description="Draft paper sections from experiment logs")
|
|
465
|
+
parser.add_argument("--sections", default=None, help="Comma-separated sections: setup,results,ablation,hyperparameters")
|
|
466
|
+
parser.add_argument("--format", default=DEFAULT_FORMAT, dest="output_format", choices=["latex", "markdown"])
|
|
467
|
+
parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
|
|
468
|
+
parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
|
|
469
|
+
parser.add_argument("--output", default="paper/sections", help="Output directory")
|
|
470
|
+
parser.add_argument("--json", action="store_true", help="Output raw JSON")
|
|
471
|
+
args = parser.parse_args()
|
|
472
|
+
|
|
473
|
+
result = draft_paper(
|
|
474
|
+
sections_str=args.sections,
|
|
475
|
+
output_format=args.output_format,
|
|
476
|
+
config_path=args.config,
|
|
477
|
+
log_path=args.log,
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
# Save sections
|
|
481
|
+
if result.get("sections"):
|
|
482
|
+
saved = save_paper_sections(result["sections"], args.output)
|
|
483
|
+
for f in saved:
|
|
484
|
+
print(f"Saved: {f}", file=sys.stderr)
|
|
485
|
+
|
|
486
|
+
if args.json:
|
|
487
|
+
print(json.dumps(result, indent=2, default=str))
|
|
488
|
+
else:
|
|
489
|
+
for name, content in result.get("sections", {}).items():
|
|
490
|
+
print(f"\n{'=' * 60}")
|
|
491
|
+
print(f" {name.upper()}")
|
|
492
|
+
print(f"{'=' * 60}\n")
|
|
493
|
+
print(content)
|
|
494
|
+
print()
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
if __name__ == "__main__":
|
|
498
|
+
main()
|
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Literature integration for ML experiments.
|
|
3
|
+
|
|
4
|
+
Targeted literature search scoped to the current experiment's domain.
|
|
5
|
+
Three modes: free query, baseline SOTA comparison, related papers.
|
|
6
|
+
|
|
7
|
+
Uses Semantic Scholar API (free, no key required for basic search)
|
|
8
|
+
with fallback to local-only mode when offline.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
python scripts/literature_search.py "gradient boosting tabular" # Free query
|
|
12
|
+
python scripts/literature_search.py --baseline # SOTA comparison
|
|
13
|
+
python scripts/literature_search.py --related exp-042 # Related papers
|
|
14
|
+
python scripts/literature_search.py --auto-queue "query" # Queue hypotheses
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import sys
|
|
22
|
+
import urllib.error
|
|
23
|
+
import urllib.parse
|
|
24
|
+
import urllib.request
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
import yaml
|
|
29
|
+
|
|
30
|
+
from scripts.turing_io import load_config, load_experiments
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Base URL of the Semantic Scholar Graph API (free tier, no API key required).
SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1"
# Default number of papers returned per search.
DEFAULT_RESULT_COUNT = 5
# HTTP timeout (seconds) for each API request.
REQUEST_TIMEOUT = 15
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def search_semantic_scholar(
    query: str,
    limit: int = DEFAULT_RESULT_COUNT,
    fields: str = "title,authors,year,venue,abstract,citationCount,externalIds",
) -> list[dict]:
    """Query the Semantic Scholar paper-search endpoint.

    Returns a list of normalized paper dicts (title, authors, year, venue,
    truncated abstract, citation count, ids, URL). On any network or parse
    failure, returns a single-element list containing an ``error`` entry
    instead of raising.
    """
    encoded = urllib.parse.urlencode({
        "query": query,
        "limit": limit,
        "fields": fields,
    })
    endpoint = f"{SEMANTIC_SCHOLAR_API}/paper/search?{encoded}"

    try:
        request = urllib.request.Request(endpoint, headers={"User-Agent": "turing-ml/2.0"})
        with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
            payload = json.loads(response.read().decode())
    except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, json.JSONDecodeError) as exc:
        # Offline / API-down path: callers check for the "error" key.
        return [{"error": f"Semantic Scholar API failed: {exc}"}]

    results: list[dict] = []
    for entry in payload.get("data", []):
        external = entry.get("externalIds") or {}
        pid = entry.get("paperId")
        results.append({
            "title": entry.get("title", "Untitled"),
            # Cap the author list at five names.
            "authors": [a.get("name", "?") for a in (entry.get("authors") or [])][:5],
            "year": entry.get("year"),
            "venue": entry.get("venue") or "N/A",
            # Abstracts are truncated to keep reports compact.
            "abstract": (entry.get("abstract") or "")[:300],
            "citation_count": entry.get("citationCount", 0),
            "paper_id": pid,
            "doi": external.get("DOI"),
            "arxiv_id": external.get("ArXiv"),
            "url": f"https://www.semanticscholar.org/paper/{pid}" if pid else None,
        })

    return results
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def build_query_from_config(config: dict) -> str:
    """Assemble a literature-search query from fields of the project config.

    Combines the task description, model type, primary metric, and data
    source (skipping sources that look like unexpanded "{...}" template
    placeholders). Falls back to "machine learning" when nothing is set.
    """
    candidates = [
        config.get("task_description", ""),
        config.get("model", {}).get("type", ""),
        config.get("evaluation", {}).get("primary_metric", ""),
    ]

    source = config.get("data", {}).get("source", "")
    # A leading "{" means the value is an unfilled template placeholder.
    candidates.append(source if source and not source.startswith("{") else "")

    terms = [term for term in candidates if term]
    return " ".join(terms) if terms else "machine learning"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def build_query_from_experiment(experiment: dict) -> str:
    """Derive a search query from an experiment's model type and description.

    The description contributes at most its first 100 characters. Falls
    back to "machine learning experiment" when neither field is set.
    """
    candidates = [
        experiment.get("config", {}).get("model_type", ""),
        (experiment.get("description") or "")[:100],
    ]
    terms = [term for term in candidates if term]
    return " ".join(terms) if terms else "machine learning experiment"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def search_baseline(
    config: dict,
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool,
) -> dict:
    """Search for SOTA baseline papers and report the current best experiment.

    Returns a dict with the query used, the papers found, and the
    project's best "kept" experiment on *primary_metric* for comparison.
    On API failure, returns an error dict instead.
    """
    query = build_query_from_config(config) + " state of the art benchmark"

    papers = search_semantic_scholar(query, limit=10)
    if papers and "error" in papers[0]:
        return {"error": papers[0]["error"], "query": query}

    def improves(candidate: float, incumbent: float) -> bool:
        # Direction of "better" depends on the metric's orientation.
        return candidate < incumbent if lower_is_better else candidate > incumbent

    champion = None
    champion_val = float("inf") if lower_is_better else float("-inf")
    for record in experiments:
        # Only experiments that were kept count toward the current best.
        if record.get("status") != "kept":
            continue
        metric_val = record.get("metrics", {}).get(primary_metric)
        if metric_val is not None and improves(metric_val, champion_val):
            champion_val = metric_val
            champion = record

    return {
        "query": query,
        "papers": papers,
        "current_best": {
            "experiment_id": champion.get("experiment_id") if champion else None,
            "metric": primary_metric,
            "value": round(champion_val, 4) if champion else None,
        },
    }
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def search_related(
    experiment: dict,
    limit: int = DEFAULT_RESULT_COUNT,
) -> dict:
    """Find papers related to one experiment via its derived search query."""
    related_query = build_query_from_experiment(experiment)
    matches = search_semantic_scholar(related_query, limit=limit)
    return {
        "experiment_id": experiment.get("experiment_id", "?"),
        "query": related_query,
        "papers": matches,
    }
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def generate_literature_hypotheses(papers: list[dict]) -> list[dict]:
    """Turn literature findings into queued hypothesis records.

    Each non-error paper with a non-empty title yields one hypothesis.
    Hypothesis ids are numbered by the paper's position in the *input*
    list (skipped entries still advance the counter), and at most five
    hypotheses are returned.
    """
    queued: list[dict] = []
    for position, paper in enumerate(papers):
        if "error" in paper:
            continue
        title = paper.get("title", "")
        if not title:
            continue

        queued.append({
            "id": f"hyp-lit-{position + 1:03d}",
            "description": f"Investigate approach from: {title}",
            "source": "literature",
            "status": "queued",
            "priority": "normal",
            "rationale": f"Paper: {title} ({paper.get('year', '?')}, {paper.get('citation_count', 0)} citations)",
            "paper_url": paper.get("url"),
            "created_at": datetime.now(timezone.utc).isoformat(),
        })
        # Cap at five hypotheses; stop early once reached.
        if len(queued) == 5:
            break

    return queued
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def save_literature_results(results: dict, output_dir: str = "experiments/literature") -> Path:
    """Write a timestamped markdown report of *results* and return its path.

    Creates *output_dir* (and parents) if missing; the filename encodes
    the UTC time of the search.
    """
    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M%S")
    report_file = directory / f"query-{stamp}.md"
    report_file.write_text(format_literature_report(results))

    return report_file
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def format_literature_report(results: dict) -> str:
    """Render literature search results as a markdown report.

    Handles three shapes: a top-level error, an API-error entry inside
    the paper list, and a normal result set (summary table, per-paper
    details, and an optional current-best comparison section).
    """
    if "error" in results:
        return f"ERROR: {results['error']}"

    out = [
        "# Literature Search",
        "",
        f"*Query: {results.get('query', '')}*",
        f"*Mode: {results.get('mode', 'query')}*",
        "",
    ]

    papers = results.get("papers", [])
    if not papers:
        out.append("No papers found.")
        return "\n".join(out)

    failures = [p for p in papers if "error" in p]
    if failures:
        # Surface the first API error and stop; there is nothing to tabulate.
        out.append(f"**API Error:** {failures[0]['error']}")
        out.append("")
        out.append("*Search may be offline. Try again later or use a manual search.*")
        return "\n".join(out)

    # Summary table, one row per paper.
    out += [
        "## Results",
        "",
        "| # | Title | Year | Venue | Citations |",
        "|---|-------|------|-------|-----------|",
    ]
    for rank, paper in enumerate(papers, 1):
        out.append(
            f"| {rank} | {paper.get('title', 'Untitled')} | {paper.get('year', '?')}"
            f" | {paper.get('venue', 'N/A')} | {paper.get('citation_count', 0)} |"
        )

    # Per-paper detail blocks.
    out += ["", "## Details", ""]
    for rank, paper in enumerate(papers, 1):
        author_list = paper.get("authors", [])
        byline = ", ".join(author_list[:3])
        if len(author_list) > 3:
            byline += " et al."

        out.append(f"### {rank}. {paper.get('title', 'Untitled')}")
        out.append("")
        out.append(f"**Authors:** {byline}")
        out.append(
            f"**Year:** {paper.get('year', '?')} | **Venue:** {paper.get('venue', 'N/A')}"
            f" | **Citations:** {paper.get('citation_count', 0)}"
        )
        link = paper.get("url", "")
        if link:
            out.append(f"**URL:** {link}")
        summary = paper.get("abstract", "")
        if summary:
            out.append(f"**Abstract:** {summary}...")
        out.append("")

    # Optional comparison against the project's current best experiment.
    best = results.get("current_best")
    if best and best.get("value") is not None:
        out += [
            "## Current Performance",
            "",
            f"- **Best experiment:** {best.get('experiment_id', '?')}",
            f"- **{best['metric']}:** {best['value']:.4f}",
            "",
            "*Compare against reported baselines in the papers above.*",
        ]

    return "\n".join(out)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def queue_literature_hypotheses(hypotheses: list[dict], queue_path: str = "hypotheses.yaml") -> int:
    """Append not-yet-queued literature hypotheses to the YAML queue file.

    Hypotheses whose ``id`` already exists in the queue are skipped.
    Returns the number of hypotheses actually added.
    """
    queue_file = Path(queue_path)

    current: list = []
    if queue_file.exists() and queue_file.stat().st_size > 0:
        with open(queue_file) as handle:
            loaded = yaml.safe_load(handle)
        if isinstance(loaded, list):
            current = loaded

    known_ids = {entry.get("id") for entry in current}
    additions = [h for h in hypotheses if h["id"] not in known_ids]

    if additions:
        current.extend(additions)
        # Preserve insertion order in the written YAML.
        with open(queue_file, "w") as handle:
            yaml.dump(current, handle, default_flow_style=False, sort_keys=False)

    return len(additions)
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def run_literature_search(
    query: str | None = None,
    baseline: bool = False,
    related_exp_id: str | None = None,
    auto_queue: bool = False,
    config_path: str = "config.yaml",
    log_path: str = "experiments/log.jsonl",
    limit: int = DEFAULT_RESULT_COUNT,
) -> dict:
    """Dispatch a literature search to the appropriate mode.

    Args:
        query: Free-text search query.
        baseline: If True, search for SOTA baselines.
        related_exp_id: If set, find papers related to this experiment.
        auto_queue: Auto-queue hypotheses from findings.
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        limit: Maximum number of results.

    Returns:
        Literature search result dict (timestamped; may include generated
        hypotheses and, with *auto_queue*, a queued count).
    """
    config = load_config(config_path)
    evaluation = config.get("evaluation", {})
    experiments = load_experiments(log_path)

    if baseline:
        result = search_baseline(
            config,
            experiments,
            evaluation.get("primary_metric", "accuracy"),
            evaluation.get("lower_is_better", False),
        )
        result["mode"] = "baseline"
    elif related_exp_id:
        target = next(
            (e for e in experiments if e.get("experiment_id") == related_exp_id),
            None,
        )
        if not target:
            return {"error": f"Experiment {related_exp_id} not found", "mode": "related"}
        result = search_related(target, limit=limit)
        result["mode"] = "related"
    else:
        # Free-text query, or one derived from the config when absent.
        effective_query = query or build_query_from_config(config)
        result = {
            "query": effective_query,
            "papers": search_semantic_scholar(effective_query, limit=limit),
            "mode": "query",
        }

    result["timestamp"] = datetime.now(timezone.utc).isoformat()

    # Only generate hypotheses from a clean (error-free) paper list.
    found = result.get("papers", [])
    if found and all("error" not in p for p in found):
        hypotheses = generate_literature_hypotheses(found)
        result["hypotheses"] = hypotheses
        if auto_queue and hypotheses:
            added = queue_literature_hypotheses(hypotheses)
            result["hypotheses_queued"] = added
            print(f"Queued {added} hypotheses from literature", file=sys.stderr)

    return result
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def main() -> None:
    """Parse CLI flags, run the literature search, persist and print the report."""
    cli = argparse.ArgumentParser(description="Literature search for ML experiments")
    cli.add_argument("query", nargs="?", default=None, help="Free-text search query")
    cli.add_argument("--baseline", action="store_true", help="Search for SOTA baselines")
    cli.add_argument("--related", default=None, metavar="EXP_ID", help="Find related papers for experiment")
    cli.add_argument("--auto-queue", action="store_true", help="Auto-queue hypotheses from findings")
    cli.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    cli.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
    cli.add_argument("--limit", type=int, default=DEFAULT_RESULT_COUNT, help="Max results")
    cli.add_argument("--json", action="store_true", help="Output raw JSON")
    opts = cli.parse_args()

    result = run_literature_search(
        query=opts.query,
        baseline=opts.baseline,
        related_exp_id=opts.related,
        auto_queue=opts.auto_queue,
        config_path=opts.config,
        log_path=opts.log,
        limit=opts.limit,
    )

    # Persist a markdown report unless the search itself failed.
    if "error" not in result:
        saved_path = save_literature_results(result)
        print(f"Saved to {saved_path}", file=sys.stderr)

    if opts.json:
        print(json.dumps(result, indent=2, default=str))
    else:
        print(format_literature_report(result))
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -102,6 +102,8 @@ TEMPLATE_DIRS = {
|
|
|
102
102
|
"equivalence_checker.py",
|
|
103
103
|
"latency_benchmark.py",
|
|
104
104
|
"export_card.py",
|
|
105
|
+
"literature_search.py",
|
|
106
|
+
"draft_paper_sections.py",
|
|
105
107
|
],
|
|
106
108
|
"tests": ["__init__.py", "conftest.py"],
|
|
107
109
|
}
|
|
@@ -118,6 +120,8 @@ DIRECTORIES_TO_CREATE = [
|
|
|
118
120
|
"experiments/profiles",
|
|
119
121
|
"experiments/checkpoints",
|
|
120
122
|
"exports",
|
|
123
|
+
"experiments/literature",
|
|
124
|
+
"paper/sections",
|
|
121
125
|
"models/best",
|
|
122
126
|
"models/archive",
|
|
123
127
|
]
|