claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,216 @@
1
+ # Autoresearch: {{PROJECT_NAME}} Model Training
2
+
3
+ *"An experiment is a question which science poses to Nature, and a measurement is the recording of Nature's answer."*
4
+
5
+ ## Goal
6
+
7
+ {{TASK_DESCRIPTION}}
8
+
9
+ **Primary metric:** {{TARGET_METRIC}} ({{METRIC_DIRECTION}} is better)
10
+ **Secondary metrics:** as configured under `evaluation.metrics` in `config.yaml`
11
+
12
+ ## The Fundamental Constraint
13
+
14
+ **You modify `train.py` and `config.yaml`. You do NOT modify `prepare.py` or `evaluate.py`. Ever.**
15
+
16
+ This separation is not a convention — it is the architectural invariant that makes your results comparable. If you could change evaluation between experiments, no comparison would be valid. The measurement apparatus is sacred.
17
+
18
+ | Layer | Files | Your Access |
19
+ |-------|-------|-------------|
20
+ | Hidden | `evaluate.py` | NONE — do not read, reference, or access |
21
+ | Measurement | `prepare.py` | READ-ONLY |
22
+ | Hypothesis | `train.py`, `config.yaml` | READ-WRITE |
23
+ | Features | `features/featurizers.py` | READ-ONLY (modify how `train.py` uses it) |
24
+
25
+ ## Configuration
26
+
27
+ All hyperparameters live in `config.yaml`. Edit it for parameter changes — do NOT hardcode values in `train.py`.
28
+
29
+ Key sections:
30
+ - `model.type` — model framework (xgboost, lightgbm, etc.)
31
+ - `model.hyperparams` — all model hyperparameters
32
+ - `convergence.patience` — consecutive non-improvements before stopping
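+ 
+ As an illustration only (the shipped `config.yaml` template is the authoritative layout), `train.py` might read these keys rather than hardcoding values:
+ 
+ ```python
+ import yaml
+ 
+ # Sketch: pull hyperparameters from config.yaml instead of hardcoding them in train.py.
+ with open("config.yaml") as f:
+     cfg = yaml.safe_load(f)
+ 
+ model_type = cfg["model"]["type"]          # e.g. "xgboost" or "lightgbm"
+ hyperparams = cfg["model"]["hyperparams"]  # passed straight to the model constructor
+ patience = cfg["convergence"]["patience"]  # consecutive non-improvements before stopping
+ ```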
33
+
34
+ ## Branches
35
+
36
+ Create per-experiment branches to preserve all code variants:
37
+ ```
38
+ git checkout -b exp/NNN-description
39
+ # ... make changes, run experiment ...
40
+ # If improved: git checkout main && git merge exp/NNN-description
41
+ # If not improved: git checkout main (branch preserved)
42
+ ```
43
+
44
+ ## Memory
45
+
46
+ Read `.claude/agent-memory/ml-researcher/MEMORY.md` at the start of each session.
47
+ Update it after each experiment with:
48
+ - Best result (if improved)
49
+ - What was tried and why
50
+ - What worked / what failed
51
+ - Promising next directions
52
+
53
+ ## Sweep
54
+
55
+ For systematic hyperparameter search:
56
+ 1. Edit `sweep_config.yaml` with parameter ranges
57
+ 2. Generate queue: `python scripts/sweep.py`
58
+ 3. Check status: `python scripts/sweep.py --status`
59
+ 4. Get next: `python scripts/sweep.py --next`
60
+ 5. Apply overrides, create branch, run training
61
+ 6. Mark done: `python scripts/sweep.py --mark <name> complete|failed`
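+ 
+ Conceptually, the queue generation in step 2 is a Cartesian product over the parameter ranges. A minimal sketch, assuming `sweep_config.yaml` holds a `params` mapping of name to candidate values (the shipped template defines the real schema, and `scripts/sweep.py` also tracks queue status):
+ 
+ ```python
+ import itertools
+ 
+ import yaml
+ 
+ # Expand a hypothetical {"params": {"max_depth": [4, 8], "learning_rate": [0.1, 0.01]}} grid.
+ with open("sweep_config.yaml") as f:
+     grid = yaml.safe_load(f).get("params", {})
+ 
+ names = sorted(grid)
+ for values in itertools.product(*(grid[name] for name in names)):
+     print(dict(zip(names, values)))  # each combination becomes one queued experiment
+ ```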
62
+
63
+ ## THE LOOP
64
+
65
+ The autoresearch experiment loop. Each iteration is one experiment — one hypothesis tested.
66
+
67
+ 1. **OBSERVE** — Read recent results, check hypothesis queue, research plan, and review failed diffs:
68
+ ```bash
69
+ python scripts/show_metrics.py --last 5
70
+ python scripts/manage_hypotheses.py next 2>/dev/null || echo "No queued hypotheses"
71
+ cat RESEARCH_PLAN.md 2>/dev/null || true
72
+ ```
73
+
74
+ If `RESEARCH_PLAN.md` exists, use it for strategic direction (which model families to explore, in what order, what budget). The plan is advisory — deviate if evidence warrants, but note why.
75
+
76
+ For the most recent discarded experiments, read the actual git diff to understand what was tried and failed — do NOT rely on your own memory of what you changed:
77
+ ```bash
78
+ # Show diffs from recent discarded experiment branches
79
+ for branch in $(git branch --list 'exp/*' --sort=-committerdate --format='%(refname:short)' | head -3); do
80
+ echo "=== $branch ==="
81
+ git diff main...$branch -- train.py config.yaml 2>/dev/null | head -40
82
+ done
83
+ ```
84
+
85
+ 2. **HYPOTHESIZE** — Check the queue first. If a queued hypothesis exists (especially human-injected, high priority), use it. Otherwise, generate your own and **register it in the queue before executing**:
86
+
87
+ **If using a queued hypothesis:**
88
+ ```bash
89
+ python scripts/manage_hypotheses.py mark hyp-NNN in-progress
90
+ ```
91
+
92
+ **If generating your own hypothesis**, register it with structured detail:
93
+ ```bash
94
+ python scripts/manage_hypotheses.py add "your hypothesis description" \
95
+ --priority medium --source agent \
96
+ --model-type xgboost \
97
+ --hyperparams '{"max_depth": 8, "n_estimators": 200}' \
98
+ --family optimizer-sweep \
99
+ --tags "depth,estimators" \
100
+ --parent exp-NNN \
101
+ --expected "deeper trees should capture feature interactions"
102
+ python scripts/manage_hypotheses.py mark hyp-NNN in-progress
103
+ ```
104
+
105
+ This creates both an index entry in `hypotheses.yaml` and a detailed file at `hypotheses/hyp-NNN.yaml` with full architecture, hyperparameters, expected outcome, and lineage.
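+ 
+ The exact schema is owned by `manage_hypotheses.py`, but the detail file is plain YAML, so a sketch of inspecting one programmatically (file name and field names are assumptions mirroring the CLI flags above) would be:
+ 
+ ```python
+ import yaml
+ 
+ # Hypothetical: load a hypothesis detail file and print fields implied by the CLI flags.
+ with open("hypotheses/hyp-001.yaml") as f:
+     hyp = yaml.safe_load(f)
+ 
+ print(hyp.get("model_type"), hyp.get("hyperparams"))
+ print(hyp.get("family"), hyp.get("parent"), hyp.get("expected"))
+ ```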
106
+
107
+ Every experiment must have a corresponding hypothesis in the queue. This ensures the hypothesis database is a complete record of every idea — human and agent alike.
108
+
109
+ To read a hypothesis's full detail:
110
+ ```bash
111
+ python scripts/manage_hypotheses.py show hyp-NNN
112
+ ```
113
+
114
+ 3. **PREPARE** — Modify `config.yaml` for hyperparameter changes. Only modify `train.py` for structural code changes.
115
+
116
+ 4. **COMMIT** the experiment:
117
+ ```bash
118
+ git commit -am "exp: {description}"
119
+ ```
120
+
121
+ 5. **EXECUTE** training:
122
+ ```bash
123
+ source .venv/bin/activate && python train.py > run.log 2>&1
124
+ ```
125
+
126
+ 6. **MEASURE** — Parse metrics from run.log:
127
+ ```bash
128
+ grep -A 10 "^---" run.log | head -10
129
+ ```
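+ 
+ The shipped `scripts/parse_metrics.py` and `scripts/show_metrics.py` are the canonical parsers; as a sketch of the idea, assuming `train.py` prints `name: value` pairs after a `---` delimiter (which is what the grep above relies on):
+ 
+ ```python
+ import re
+ 
+ # Collect numeric "name: value" pairs that follow the last "---" delimiter in run.log.
+ with open("run.log") as f:
+     tail = f.read().rsplit("---", 1)[-1]
+ 
+ metrics = {
+     match.group(1): float(match.group(2))
+     for match in re.finditer(r"^(\w[\w.]*):\s*(-?\d+(?:\.\d+)?)\s*$", tail, re.MULTILINE)
+ }
+ print(metrics)
+ ```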
130
+
131
+ 7. **DECIDE:**
132
+
133
+ **If improved** over current best:
134
+ - Keep the commit
135
+ - Copy model: `cp models/model.joblib models/best/model.joblib`
136
+ - Update `models/best/metadata.json`
137
+
138
+ **If NOT improved:**
139
+ ```bash
140
+ git reset --hard HEAD~1
141
+ ```
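+ 
+ "Improved" should respect the metric direction; a sketch mirroring the relative-improvement logic in `scripts/check_convergence.py`:
+ 
+ ```python
+ def improved(current: float, best: float, lower_is_better: bool, threshold: float = 0.0) -> bool:
+     """True when `current` beats `best` by at least `threshold` relative improvement."""
+     if best == 0:
+         return current != 0
+     gain = (best - current) if lower_is_better else (current - best)
+     return gain / abs(best) >= threshold
+ ```
+ 
+ Passing `convergence.improvement_threshold` from `config.yaml` as `threshold` keeps the keep/discard decision consistent with the convergence check.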
142
+
143
+ 8. **RECORD** — Log the experiment (kept or discarded):
144
+ ```bash
145
+ python scripts/log_experiment.py experiments/log.jsonl exp-NNN kept|discarded \
146
+ '{"{{TARGET_METRIC}}": X.XX, ...}' \
147
+ '{"model_type": "xgboost", "hyperparams": {...}}' \
148
+ models/model.joblib "Description of hypothesis and outcome"
149
+ ```
150
+
151
+ Update the hypothesis status with result metrics:
152
+ ```bash
153
+ python scripts/manage_hypotheses.py mark hyp-NNN tested \
154
+ --result exp-NNN \
155
+ --metrics '{"{{TARGET_METRIC}}": X.XX, ...}' \
156
+ --notes "Brief explanation of what happened and why"
157
+ # or: mark hyp-NNN promising (if it improved significantly)
158
+ # or: mark hyp-NNN dead-end (if it clearly failed)
159
+ ```
160
+
161
+ Then synthesize a decision packet and auto-queue follow-ups:
162
+ ```bash
163
+ python scripts/synthesize_decision.py --experiment exp-NNN --auto-queue
164
+ ```
165
+ This produces a verdict (promote/branch_followup/abandon/fix_and_retry) and automatically queues follow-up hypotheses for `branch_followup` and `fix_and_retry` outcomes.
166
+
167
+ 9. **CONVERGE** — Check stopping conditions:
168
+ - N consecutive non-improvements (`config.yaml` → `convergence.patience`) = STOP
169
+ - `max_iterations` reached = STOP
170
+ - Report final best model and recommend next steps
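+ 
+ The first two conditions are what `scripts/check_convergence.py` evaluates; per its docstring it exits 2 when converged and 0 otherwise, so a loop driver can branch on the return code:
+ 
+ ```python
+ import subprocess
+ import sys
+ 
+ # Exit code 2 = converged (stop), 0 = keep going (see scripts/check_convergence.py).
+ result = subprocess.run([sys.executable, "scripts/check_convergence.py"])
+ if result.returncode == 2:
+     print("Converged: report the best model and stop the loop.")
+ ```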
171
+
172
+ 10. **REPEAT** — return to step 1.
173
+
174
+ ## Execution Rules
175
+
176
+ - **ALWAYS redirect output:** `python train.py > run.log 2>&1`
177
+ - **ALWAYS parse with grep:** `grep -A 10 "^---" run.log | head -10`
178
+ - **ALWAYS activate venv:** `source .venv/bin/activate`
179
+ - **NEVER install packages** without human approval
180
+
181
+ ## Strategy Escalation Protocol
182
+
183
+ When consecutive experiments fail to improve, escalate your approach rather than repeating similar attempts:
184
+
185
+ | Consecutive Failures | Strategy | Description |
186
+ |---------------------|----------|-------------|
187
+ | 0-1 | **EXPLOIT** | Push further in the current direction — small tweaks, parameter refinement |
188
+ | 2-3 | **RE-READ** | Stop. Re-read ALL code from scratch. Your mental model is likely stale. |
189
+ | 4-5 | **COMBINE** | Combine two previously successful ideas that haven't been tried together |
190
+ | 6+ | **RADICAL** | Abandon the current approach entirely. Try a fundamentally different model, architecture, or feature strategy. |
191
+
192
+ Track your consecutive failure count. When you hit a new tier, announce it: "Escalating to COMBINE strategy after 4 consecutive failures."
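+ 
+ A tiny helper that maps the failure count to a tier (thresholds taken directly from the table above):
+ 
+ ```python
+ def escalation_tier(consecutive_failures: int) -> str:
+     """Map consecutive non-improvements to the strategy table above."""
+     if consecutive_failures <= 1:
+         return "EXPLOIT"
+     if consecutive_failures <= 3:
+         return "RE-READ"
+     if consecutive_failures <= 5:
+         return "COMBINE"
+     return "RADICAL"
+ ```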
193
+
194
+ ## Experiment Ideas
195
+
196
+ Starting suggestions (ordered by expected impact):
197
+
198
+ 1. **Hyperparameter sweep:** max_depth, n_estimators, learning_rate
199
+ 2. **LightGBM:** often faster than XGBoost with comparable accuracy
200
+ 3. **Feature engineering:** domain-specific features via the featurizer pipeline
201
+ 4. **sklearn alternatives:** RandomForest, GradientBoosting
202
+ 5. **Learning rate schedule:** lower lr with more estimators (0.01 / 1000 trees)
203
+ 6. **Neural network:** if samples > 2000, try a small MLP
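+ 
+ For example, ideas 1 and 5 translate into hyperparameter overrides along these lines (placeholder values, not tuned recommendations):
+ 
+ ```python
+ # Illustrative override sets; apply them through config.yaml, not by editing train.py.
+ sweep_candidates = [
+     {"max_depth": 4, "n_estimators": 300, "learning_rate": 0.1},
+     {"max_depth": 8, "n_estimators": 300, "learning_rate": 0.1},
+ ]
+ slow_and_deep = {"learning_rate": 0.01, "n_estimators": 1000}  # idea 5: lower lr, more trees
+ ```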
204
+
205
+ ## Output Format
206
+
207
+ - **Model artifact:** `models/best/model.joblib`
208
+ - **Metadata:** `models/best/metadata.json`
209
+ - **Experiment log:** `experiments/log.jsonl` (append-only JSONL)
210
+ - **TSV summary:** `experiments/results.tsv`
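+ 
+ A minimal sketch of scanning the append-only log, using the field names that `scripts/check_convergence.py` and `scripts/compare_runs.py` read:
+ 
+ ```python
+ import json
+ 
+ # List every logged experiment with its status and metrics.
+ with open("experiments/log.jsonl") as f:
+     for line in f:
+         if line.strip():
+             entry = json.loads(line)
+             print(entry.get("experiment_id"), entry.get("status"), entry.get("metrics", {}))
+ ```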
211
+
212
+ ## Comparing Runs
213
+
214
+ ```bash
215
+ python scripts/compare_runs.py exp-001 exp-002
216
+ ```
@@ -0,0 +1,8 @@
1
+ [project]
2
+ name = "{{PROJECT_NAME}}-ml"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.12"
5
+
6
+ [tool.pytest.ini_options]
7
+ testpaths = ["tests"]
8
+ pythonpath = ["."]
@@ -0,0 +1,8 @@
1
+ scikit-learn>=1.6
2
+ xgboost>=3.2
3
+ lightgbm>=4.6
4
+ pandas>=2.2
5
+ numpy>=2.0
6
+ joblib>=1.4
7
+ pyyaml>=6.0
8
+ pytest>=8.0
File without changes
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ """Convergence detection for the autoresearch pipeline.
3
+
4
+ Reads experiments/log.jsonl and checks if the last N experiments
5
+ (where N = convergence.patience) show insufficient improvement
6
+ over the best prior result.
7
+
8
+ This is a discrete analogue of early stopping from gradient descent,
9
+ adapted for the experiment loop context. The algorithm:
10
+
11
+ 1. Load all "kept" experiments from the JSONL log
12
+ 2. For each of the last N experiments, compute relative improvement
13
+ over the prior best
14
+ 3. If all N show < threshold improvement, declare convergence
15
+
16
+ Usage:
17
+ python scripts/check_convergence.py [--config config.yaml] [--log experiments/log.jsonl]
18
+
19
+ Exit codes:
20
+ 0 = not converged, agent should continue
21
+ 2 = converged, agent should stop (signals /loop to halt)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from pathlib import Path
30
+
31
+ import yaml
32
+
33
+
34
+ def load_convergence_config(config_path: str) -> dict:
35
+ """Load convergence parameters from config.yaml.
36
+
37
+ Returns dict with keys: patience, improvement_threshold,
38
+ primary_metric, lower_is_better.
39
+
40
+ Falls back to conservative defaults if config cannot be loaded.
41
+ """
42
+ defaults = {
43
+ "patience": 3,
44
+ "improvement_threshold": 0.005,
45
+ "primary_metric": "accuracy",
46
+ "lower_is_better": False,
47
+ }
48
+
49
+ path = Path(config_path)
50
+ if not path.exists():
51
+ print(f"convergence: Config not found at {config_path}, using defaults.", file=sys.stderr)
52
+ return defaults
53
+
54
+ try:
55
+ with open(path) as f:
56
+ config = yaml.safe_load(f)
57
+
58
+ convergence_cfg = config.get("convergence", {})
59
+ eval_cfg = config.get("evaluation", {})
60
+
61
+ return {
62
+ "patience": convergence_cfg.get("patience", defaults["patience"]),
63
+ "improvement_threshold": convergence_cfg.get(
64
+ "improvement_threshold", defaults["improvement_threshold"]
65
+ ),
66
+ "primary_metric": eval_cfg.get("primary_metric", defaults["primary_metric"]),
67
+ "lower_is_better": eval_cfg.get("lower_is_better", defaults["lower_is_better"]),
68
+ }
69
+ except (yaml.YAMLError, AttributeError) as e:
70
+ print(f"convergence: Error reading config: {e}. Using defaults.", file=sys.stderr)
71
+ return defaults
72
+
73
+
74
+ def load_kept_experiments(log_path: str, primary_metric: str) -> list[dict]:
75
+ """Load all 'kept' experiments with valid primary metric values.
76
+
77
+ Args:
78
+ log_path: Path to experiments/log.jsonl.
79
+ primary_metric: Metric name to extract.
80
+
81
+ Returns:
82
+ List of dicts with 'id' and 'value' keys, in chronological order.
83
+ """
84
+ path = Path(log_path)
85
+ if not path.exists():
86
+ return []
87
+
88
+ experiments = []
89
+ with open(path) as f:
90
+ for line in f:
91
+ line = line.strip()
92
+ if not line:
93
+ continue
94
+ try:
95
+ entry = json.loads(line)
96
+ value = entry.get("metrics", {}).get(primary_metric)
97
+ if value is not None and entry.get("status") == "kept":
98
+ experiments.append({
99
+ "id": entry.get("experiment_id", "?"),
100
+ "value": float(value),
101
+ })
102
+ except (json.JSONDecodeError, ValueError, TypeError):
103
+ continue
104
+
105
+ return experiments
106
+
107
+
108
+ def compute_relative_improvement(
109
+ current: float,
110
+ prior_best: float,
111
+ lower_is_better: bool,
112
+ ) -> float:
113
+ """Compute relative improvement of current value over prior best.
114
+
115
+ Returns a float where positive = improvement, negative = regression.
116
+ Returns 1.0 if prior_best is zero (any non-zero value is infinite improvement).
117
+ """
118
+ if prior_best == 0:
119
+ return 1.0 if current != 0 else 0.0
120
+
121
+ if lower_is_better:
122
+ return (prior_best - current) / abs(prior_best)
123
+ else:
124
+ return (current - prior_best) / abs(prior_best)
125
+
126
+
127
+ def check_convergence(
128
+ experiments: list[dict],
129
+ patience: int,
130
+ improvement_threshold: float,
131
+ lower_is_better: bool,
132
+ ) -> tuple[bool, int, str]:
133
+ """Check if the experiment loop has converged.
134
+
135
+ Args:
136
+ experiments: List of dicts with 'id' and 'value' keys.
137
+ patience: Number of consecutive non-improvements required.
138
+ improvement_threshold: Minimum relative improvement to count.
139
+ lower_is_better: True for metrics like MAE/MSE.
140
+
141
+ Returns:
142
+ Tuple of (converged: bool, non_improvements: int, message: str).
143
+ """
144
+ total = len(experiments)
145
+
146
+ if total < patience:
147
+ return (
148
+ False,
149
+ 0,
150
+ f"Only {total} experiments, need {patience} to check convergence.",
151
+ )
152
+
153
+ # Find best value across all experiments
154
+ values = [e["value"] for e in experiments]
155
+ best_value = min(values) if lower_is_better else max(values)
156
+
157
+ # Check last N experiments for improvement over their respective prior bests
158
+ non_improvements = 0
159
+ for i in range(total - patience, total):
160
+ prior_values = [e["value"] for e in experiments[:i]]
161
+ if not prior_values:
162
+ continue
163
+
164
+ prior_best = min(prior_values) if lower_is_better else max(prior_values)
165
+ current_value = experiments[i]["value"]
166
+ improvement = compute_relative_improvement(current_value, prior_best, lower_is_better)
167
+
168
+ if improvement < improvement_threshold:
169
+ non_improvements += 1
170
+
171
+ last_n = experiments[-patience:]
172
+ last_values = [round(e["value"], 4) for e in last_n]
173
174
+
175
+ if non_improvements >= patience:
176
+ msg = (
177
+ f"CONVERGED: {patience} consecutive non-improvements "
178
+ f"(threshold: {improvement_threshold * 100:.1f}% relative gain). "
179
+ f"Best={best_value:.4f}, last {patience} values={last_values}"
180
+ )
181
+ return True, non_improvements, msg
182
+ else:
183
+ msg = (
184
+ f"Not converged ({non_improvements}/{patience} non-improvements). "
185
+ f"Best={best_value:.4f}, last {patience} values={last_values}"
186
+ )
187
+ return False, non_improvements, msg
188
+
189
+
190
+ def main() -> None:
191
+ """CLI entry point."""
192
+ parser = argparse.ArgumentParser(
193
+ description="Check experiment convergence for the autoresearch pipeline"
194
+ )
195
+ parser.add_argument(
196
+ "--config",
197
+ default="config.yaml",
198
+ help="Path to config.yaml (default: config.yaml)",
199
+ )
200
+ parser.add_argument(
201
+ "--log",
202
+ default="experiments/log.jsonl",
203
+ help="Path to experiment log (default: experiments/log.jsonl)",
204
+ )
205
+ args = parser.parse_args()
206
+
207
+ # Load config
208
+ cfg = load_convergence_config(args.config)
209
+
210
+ # Load experiments
211
+ experiments = load_kept_experiments(args.log, cfg["primary_metric"])
212
+
213
+ # Check convergence
214
+ converged, non_improvements, message = check_convergence(
215
+ experiments=experiments,
216
+ patience=cfg["patience"],
217
+ improvement_threshold=cfg["improvement_threshold"],
218
+ lower_is_better=cfg["lower_is_better"],
219
+ )
220
+
221
+ print(f"convergence: {message}", file=sys.stderr)
222
+
223
+ if converged:
224
+ sys.exit(2)
225
+ else:
226
+ sys.exit(0)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ main()
@@ -0,0 +1,124 @@
1
+ """Side-by-side comparison of two experiments.
2
+
3
+ Shows configuration deltas and metric differences between two experiments,
4
+ enabling the agent (or human) to understand which changes caused which
5
+ metric movements.
6
+
7
+ Usage: python scripts/compare_runs.py exp-001 exp-002 [--log path/to/log.jsonl]
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
18
+
19
+ try:
+     from scripts.turing_io import load_config
+ except ImportError:  # fallback so `python scripts/compare_runs.py` also works when run directly
+     from turing_io import load_config
20
+
21
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
22
+
23
+
24
+ def load_experiment(log_path: str, experiment_id: str) -> dict | None:
25
+ """Load a single experiment by ID."""
26
+ path = Path(log_path)
27
+ if not path.exists():
28
+ return None
29
+
30
+ with open(path) as f:
31
+ for line in f:
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+ try:
36
+ entry = json.loads(line)
37
+ if entry.get("experiment_id") == experiment_id:
38
+ return entry
39
+ except json.JSONDecodeError:
40
+ continue
41
+ return None
42
+
43
+
44
+ def format_comparison(exp_a: dict, exp_b: dict, config: dict) -> str:
45
+ """Format side-by-side comparison of two experiments."""
46
+ id_a = exp_a.get("experiment_id", "?")
47
+ id_b = exp_b.get("experiment_id", "?")
48
+
49
+ eval_cfg = config.get("evaluation", {})
50
+ lower_is_better_metrics = set()
51
+ if eval_cfg.get("lower_is_better", False):
52
+ lower_is_better_metrics = set(eval_cfg.get("metrics", []))
53
+
54
+ lines = [
55
+ f"{'':20s} {id_a:<20s} {id_b:<20s}",
56
+ "=" * 60,
57
+ "",
58
+ "## Config",
59
+ ]
60
+
61
+ config_a = exp_a.get("config", {})
62
+ config_b = exp_b.get("config", {})
63
+ all_config_keys = sorted(set(list(config_a.keys()) + list(config_b.keys())))
64
+ for key in all_config_keys:
65
+ val_a = config_a.get(key, "N/A")
66
+ val_b = config_b.get(key, "N/A")
67
+ marker = " <--" if val_a != val_b else ""
68
+ lines.append(f" {key:<18s} {str(val_a):<20s} {str(val_b):<20s}{marker}")
69
+
70
+ lines.append("")
71
+ lines.append("## Metrics")
72
+
73
+ metrics_a = exp_a.get("metrics", {})
74
+ metrics_b = exp_b.get("metrics", {})
75
+ all_metric_keys = sorted(set(list(metrics_a.keys()) + list(metrics_b.keys())))
76
+ for key in all_metric_keys:
77
+ val_a = metrics_a.get(key, "N/A")
78
+ val_b = metrics_b.get(key, "N/A")
79
+ diff_marker = ""
80
+ if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
81
+ if key in lower_is_better_metrics:
82
+ diff_marker = " (better)" if val_b < val_a else " (worse)" if val_b > val_a else ""
83
+ else:
84
+ diff_marker = " (better)" if val_b > val_a else " (worse)" if val_b < val_a else ""
85
+ a_str = f"{val_a:.4f}" if isinstance(val_a, float) else str(val_a)
86
+ b_str = f"{val_b:.4f}" if isinstance(val_b, float) else str(val_b)
87
+ lines.append(f" {key:<18s} {a_str:<20s} {b_str:<20s}{diff_marker}")
88
+
89
+ lines.append("")
90
+ lines.append("## Status")
91
+ lines.append(f" {'status':<18s} {exp_a.get('status', '?'):<20s} {exp_b.get('status', '?'):<20s}")
92
+ lines.append(f" {'timestamp':<18s} {exp_a.get('timestamp', '?')[:19]:<20s} {exp_b.get('timestamp', '?')[:19]:<20s}")
93
+
94
+ return "\n".join(lines)
95
+
96
+
97
+ def main() -> None:
98
+ """CLI entry point."""
99
+ parser = argparse.ArgumentParser(description="Compare two experiment runs")
100
+ parser.add_argument("exp_a", help="First experiment ID (e.g., exp-001)")
101
+ parser.add_argument("exp_b", help="Second experiment ID (e.g., exp-002)")
102
+ parser.add_argument(
103
+ "--log",
104
+ default=DEFAULT_LOG_PATH,
105
+ help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
106
+ )
107
+ args = parser.parse_args()
108
+
109
+ a = load_experiment(args.log, args.exp_a)
110
+ b = load_experiment(args.log, args.exp_b)
111
+
112
+ if a is None:
113
+ print(f"Experiment {args.exp_a} not found in {args.log}")
114
+ sys.exit(1)
115
+ if b is None:
116
+ print(f"Experiment {args.exp_b} not found in {args.log}")
117
+ sys.exit(1)
118
+
119
+ config = load_config()
120
+ print(format_comparison(a, b, config))
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()