claude-turing 4.5.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +18 -0
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +1 -1
- package/commands/turing.md +85 -77
- package/config/commands.yaml +928 -0
- package/package.json +11 -4
- package/skills/turing/SKILL.md +180 -0
- package/skills/turing/ablate/SKILL.md +47 -0
- package/skills/turing/annotate/SKILL.md +23 -0
- package/skills/turing/archive/SKILL.md +23 -0
- package/skills/turing/audit/SKILL.md +56 -0
- package/skills/turing/baseline/SKILL.md +45 -0
- package/skills/turing/brief/SKILL.md +95 -0
- package/skills/turing/budget/SKILL.md +52 -0
- package/skills/turing/calibrate/SKILL.md +47 -0
- package/skills/turing/card/SKILL.md +36 -0
- package/skills/turing/changelog/SKILL.md +22 -0
- package/skills/turing/checkpoint/SKILL.md +47 -0
- package/skills/turing/cite/SKILL.md +23 -0
- package/skills/turing/compare/SKILL.md +24 -0
- package/skills/turing/counterfactual/SKILL.md +27 -0
- package/skills/turing/curriculum/SKILL.md +43 -0
- package/skills/turing/design/SKILL.md +97 -0
- package/skills/turing/diagnose/SKILL.md +52 -0
- package/skills/turing/diff/SKILL.md +48 -0
- package/skills/turing/distill/SKILL.md +56 -0
- package/skills/turing/doctor/SKILL.md +31 -0
- package/skills/turing/ensemble/SKILL.md +54 -0
- package/skills/turing/explore/SKILL.md +107 -0
- package/skills/turing/export/SKILL.md +48 -0
- package/skills/turing/feature/SKILL.md +42 -0
- package/skills/turing/flashback/SKILL.md +22 -0
- package/skills/turing/fork/SKILL.md +40 -0
- package/skills/turing/frontier/SKILL.md +45 -0
- package/skills/turing/init/SKILL.md +154 -0
- package/skills/turing/leak/SKILL.md +47 -0
- package/skills/turing/lit/SKILL.md +47 -0
- package/skills/turing/logbook/SKILL.md +51 -0
- package/skills/turing/merge/SKILL.md +24 -0
- package/skills/turing/mode/SKILL.md +43 -0
- package/skills/turing/onboard/SKILL.md +20 -0
- package/skills/turing/paper/SKILL.md +44 -0
- package/skills/turing/plan/SKILL.md +27 -0
- package/skills/turing/poster/SKILL.md +89 -0
- package/skills/turing/postmortem/SKILL.md +28 -0
- package/skills/turing/preflight/SKILL.md +75 -0
- package/skills/turing/present/SKILL.md +23 -0
- package/skills/turing/profile/SKILL.md +43 -0
- package/skills/turing/prune/SKILL.md +26 -0
- package/skills/turing/quantize/SKILL.md +24 -0
- package/skills/turing/queue/SKILL.md +48 -0
- package/skills/turing/registry/SKILL.md +31 -0
- package/skills/turing/regress/SKILL.md +53 -0
- package/skills/turing/replay/SKILL.md +23 -0
- package/skills/turing/report/SKILL.md +97 -0
- package/skills/turing/reproduce/SKILL.md +48 -0
- package/skills/turing/retry/SKILL.md +41 -0
- package/skills/turing/review/SKILL.md +20 -0
- package/skills/turing/rules/loop-protocol.md +91 -0
- package/skills/turing/sanity/SKILL.md +48 -0
- package/skills/turing/scale/SKILL.md +55 -0
- package/skills/turing/search/SKILL.md +22 -0
- package/skills/turing/seed/SKILL.md +47 -0
- package/skills/turing/sensitivity/SKILL.md +41 -0
- package/skills/turing/share/SKILL.md +20 -0
- package/skills/turing/simulate/SKILL.md +28 -0
- package/skills/turing/status/SKILL.md +24 -0
- package/skills/turing/stitch/SKILL.md +49 -0
- package/skills/turing/suggest/SKILL.md +159 -0
- package/skills/turing/surgery/SKILL.md +27 -0
- package/skills/turing/sweep/SKILL.md +45 -0
- package/skills/turing/template/SKILL.md +22 -0
- package/skills/turing/train/SKILL.md +75 -0
- package/skills/turing/transfer/SKILL.md +54 -0
- package/skills/turing/trend/SKILL.md +21 -0
- package/skills/turing/try/SKILL.md +63 -0
- package/skills/turing/update/SKILL.md +27 -0
- package/skills/turing/validate/SKILL.md +34 -0
- package/skills/turing/warm/SKILL.md +53 -0
- package/skills/turing/watch/SKILL.md +60 -0
- package/skills/turing/whatif/SKILL.md +31 -0
- package/skills/turing/xray/SKILL.md +43 -0
- package/src/command-registry.js +160 -0
- package/src/install.js +8 -34
- package/src/sync-skills-layout.js +149 -0
- package/src/verify.js +5 -88
- package/templates/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-312.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/__init__.cpython-312.pyc +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-312.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/ablation_study.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/ablation_study.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/architecture_surgery.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/architecture_surgery.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/budget_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/budget_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/build_ensemble.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/build_ensemble.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/calibration.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/calibration.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/citation_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/citation_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/curriculum_optimizer.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/curriculum_optimizer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/diagnose_errors.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/diagnose_errors.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/draft_paper_sections.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/draft_paper_sections.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_annotations.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_annotations.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_archive.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_archive.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_queue.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_queue.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_replay.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_replay.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_search.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_search.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_templates.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_templates.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/feature_intelligence.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/feature_intelligence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/fork_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/fork_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_changelog.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_changelog.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_figures.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_figures.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/literature_search.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/literature_search.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_distiller.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_distiller.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_merger.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_merger.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_pruning.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_pruning.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_quantization.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_quantization.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/pareto_frontier.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/pareto_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/pipeline_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/pipeline_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaling_estimator.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/scaling_estimator.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/session_flashback.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/session_flashback.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/smart_retry.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/smart_retry.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/trend_analysis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/trend_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/warm_start.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/warm_start.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-314.pyc +0 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: template
|
|
3
|
+
description: Experiment template library — save winning configs as reusable templates, apply to new projects.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<save|list|apply|share> [--name name] [--from exp-id]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Turn your best experiment configs into reusable recipes that persist across projects.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. **Activate environment:** `source .venv/bin/activate`
|
|
13
|
+
2. **Run:** `python scripts/experiment_templates.py $ARGUMENTS`
|
|
14
|
+
3. **Operations:** save (from experiment), list (all templates), apply (to current project), share (export)
|
|
15
|
+
4. **Stored at:** `~/.turing/templates/` (cross-project)
|
|
16
|
+
|
|
17
|
+
## Examples
|
|
18
|
+
```
|
|
19
|
+
/turing:template save --from exp-042 --name "tabular-xgboost-v2"
|
|
20
|
+
/turing:template list
|
|
21
|
+
/turing:template apply tabular-xgboost-v2
|
|
22
|
+
```
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: train
|
|
3
|
+
description: Run the autonomous ML experiment loop. Iteratively hypothesizes, trains, evaluates, and decides — keeping only improvements. Implements the autoresearch pattern with formal convergence detection and git-disciplined rollback.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[max_iterations]"
|
|
6
|
+
allowed-tools: Read, Write, Edit, Bash(python train.py:*, python scripts/*:*, git:*, source .venv/bin/activate:*, pip:*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
You are an autonomous ML researcher. Your goal: iteratively improve a model by following the experiment loop protocol — the scientific method applied to machine learning.
|
|
10
|
+
|
|
11
|
+
Read `program.md` in the ML project directory for the complete protocol. Follow it exactly.
|
|
12
|
+
|
|
13
|
+
## Arguments
|
|
14
|
+
|
|
15
|
+
`$ARGUMENTS` — accepts a project path (e.g., `ml/coding`), a number for max_iterations, or both (e.g., `ml/coding 10`). If no number, run until convergence (as defined in `config.yaml` convergence settings).
|
|
16
|
+
|
|
17
|
+
## Bootstrap Sequence
|
|
18
|
+
|
|
19
|
+
0. **Detect project directory:**
|
|
20
|
+
- If `$ARGUMENTS` contains a path (e.g., `ml/coding`), use that as the project directory
|
|
21
|
+
- Else if cwd contains `config.yaml` and `train.py`, use cwd
|
|
22
|
+
- Else search for `ml/*/` subdirectories containing `config.yaml`
|
|
23
|
+
- If exactly one found, use it
|
|
24
|
+
- If multiple found, list them and ask the user which to target
|
|
25
|
+
- All subsequent commands run from the detected project directory
|
|
26
|
+
- Memory path: `.claude/agent-memory/ml-researcher-{project_name}/MEMORY.md`
|
|
27
|
+
|
|
28
|
+
1. **Restore memory:** Read `.claude/agent-memory/ml-researcher-{project_name}/MEMORY.md` for prior observations and best results.
|
|
29
|
+
2. **Read protocol:** Read `program.md` completely — it defines the experiment loop, constraints, and output format.
|
|
30
|
+
3. **Bootstrap data:** Check for training data at `config.yaml` → `data.source`. If no splits exist, run `python prepare.py`.
|
|
31
|
+
4. **Bootstrap venv:** `test -d .venv || (python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt)`
|
|
32
|
+
5. **Assess state:** `source .venv/bin/activate && python scripts/show_metrics.py --last 5`
|
|
33
|
+
6. **Begin the loop** from program.md.
|
|
34
|
+
|
|
35
|
+
## The Loop
|
|
36
|
+
|
|
37
|
+
Each iteration follows the experiment lifecycle (`config/lifecycle.toml`):
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
proposed -> running -> evaluating -> kept/discarded -> (next iteration)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The agent proposes a hypothesis, executes it, measures the result against the immutable evaluation harness, and decides whether to keep or discard. Only improvements survive in git history.
|
|
44
|
+
|
|
45
|
+
## Delegation
|
|
46
|
+
|
|
47
|
+
Use `@ml-evaluator` for analysis tasks. It is read-only (no Write/Edit) and cannot accidentally modify the pipeline.
|
|
48
|
+
|
|
49
|
+
## Context Management
|
|
50
|
+
|
|
51
|
+
- Redirect all training output: `python train.py > run.log 2>&1`
|
|
52
|
+
- Parse metrics with grep, never read full output
|
|
53
|
+
- Persist observations to MEMORY.md after each experiment
|
|
54
|
+
|
|
55
|
+
## Convergence
|
|
56
|
+
|
|
57
|
+
- Stop after `max_iterations` if provided
|
|
58
|
+
- Otherwise, stop after N consecutive non-improvements (`config.yaml` → `convergence.patience`)
|
|
59
|
+
- Report final best experiment and recommend next steps
|
|
60
|
+
|
|
61
|
+
## /loop Integration
|
|
62
|
+
|
|
63
|
+
For fully hands-off training:
|
|
64
|
+
```
|
|
65
|
+
/loop 5m /turing:train
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
The Stop hook automatically detects convergence and halts the loop. Recommended intervals:
|
|
69
|
+
- `3m` — fast iterations, small datasets
|
|
70
|
+
- `5m` — standard training runs
|
|
71
|
+
- `10m` — deep training with large models
|
|
72
|
+
|
|
73
|
+
## Rules
|
|
74
|
+
|
|
75
|
+
See `rules/loop-protocol.md` for safety constraints governing the experiment loop.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: transfer
|
|
3
|
+
description: Cross-project knowledge transfer — find similar prior projects and surface what worked. Builds institutional ML memory.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--from project-path] [--auto]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Find similar prior projects and surface what worked. "Last time you had tabular classification with class imbalance, LightGBM beat everything by 3%."
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--from ~/projects/fraud-detection` — transfer from a specific project
|
|
20
|
+
- `--auto` — auto-queue hypotheses from recommendations
|
|
21
|
+
- `--index ~/.turing/project_index.yaml` — custom index path
|
|
22
|
+
- `--json` — raw JSON output
|
|
23
|
+
|
|
24
|
+
3. **Run knowledge transfer:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/knowledge_transfer.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report includes:**
|
|
30
|
+
- Similar prior projects ranked by similarity score
|
|
31
|
+
- Per project: task type, winner model, key insights
|
|
32
|
+
- Suggested hypotheses from winning strategies
|
|
33
|
+
- Auto-queued hypotheses (with `--auto`)
|
|
34
|
+
|
|
35
|
+
5. **Similarity matching** uses:
|
|
36
|
+
- Task type (classification/regression) — highest weight
|
|
37
|
+
- Dataset size (log-scale comparison)
|
|
38
|
+
- Feature types (tabular/image/text)
|
|
39
|
+
- Class balance characteristics
|
|
40
|
+
- Dimensionality
|
|
41
|
+
|
|
42
|
+
6. **Project index** at `~/.turing/project_index.yaml` — local only, never uploaded
|
|
43
|
+
|
|
44
|
+
7. **If no similar projects found:** suggest running on more projects first or specifying one with `--from`
|
|
45
|
+
|
|
46
|
+
8. **Saved output:** report in `experiments/transfers/transfer-*.yaml`
|
|
47
|
+
|
|
48
|
+
## Examples
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
/turing:transfer # Search index for similar projects
|
|
52
|
+
/turing:transfer --from ~/projects/fraud-detection # Transfer from specific project
|
|
53
|
+
/turing:transfer --auto # Auto-queue hypotheses
|
|
54
|
+
```
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: trend
|
|
3
|
+
description: Long-term trend analysis — improvement velocity, family ROI, diminishing returns detection, strategic research direction.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--window 30d] [--metric accuracy]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
See the arc of your research, not just the latest results. Strategic view over 100+ experiments.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. **Activate environment:** `source .venv/bin/activate`
|
|
13
|
+
2. **Run:** `python scripts/trend_analysis.py $ARGUMENTS`
|
|
14
|
+
3. **Report:** improvement velocity over time windows, family ROI ranking, diminishing returns prediction, phase transitions
|
|
15
|
+
4. **Saved output:** `experiments/trends/trend-*.yaml`
|
|
16
|
+
|
|
17
|
+
## Examples
|
|
18
|
+
```
|
|
19
|
+
/turing:trend # Full trend analysis
|
|
20
|
+
/turing:trend --window 14d # Last 2 weeks
|
|
21
|
+
```
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: try
|
|
3
|
+
description: Inject a hypothesis into the agent's experiment queue. This is how research taste reaches the agent — the human selects which coins to flip, the agent flips them.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<hypothesis description>"
|
|
6
|
+
allowed-tools: Read, Write, Edit, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Inject a human hypothesis into the experiment queue for the next `/turing:train` iteration.
|
|
10
|
+
|
|
11
|
+
This is the taste-leverage mechanism: you provide judgment about what's worth trying, the agent provides disciplined execution.
|
|
12
|
+
|
|
13
|
+
## Steps
|
|
14
|
+
|
|
15
|
+
1. **Parse the hypothesis** from `$ARGUMENTS`. If empty, ask the user what they want the agent to try.
|
|
16
|
+
|
|
17
|
+
2. **Check for archetype syntax.** If the argument starts with `archetype:`, expand it:
|
|
18
|
+
```bash
|
|
19
|
+
source .venv/bin/activate && python scripts/manage_hypotheses.py add --archetype <name> --priority high --source human
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Otherwise, use the raw description:
|
|
23
|
+
```bash
|
|
24
|
+
source .venv/bin/activate && python scripts/manage_hypotheses.py add "$ARGUMENTS" --priority high --source human
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
3. **Confirm** with the hypothesis ID and instructions:
|
|
28
|
+
- "Queued as hyp-NNN (high priority, human-injected)"
|
|
29
|
+
- "The agent will prioritize this on the next `/turing:train` iteration"
|
|
30
|
+
- Show current queue: `python scripts/manage_hypotheses.py list --status queued`
|
|
31
|
+
|
|
32
|
+
## Examples
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
# Free-text hypotheses
|
|
36
|
+
/turing:try switch to LightGBM with dart boosting and lower learning rate
|
|
37
|
+
/turing:try add polynomial features for the numeric columns
|
|
38
|
+
/turing:try increase regularization, the train/val gap suggests overfitting
|
|
39
|
+
|
|
40
|
+
# Archetype-based structured strategies
|
|
41
|
+
/turing:try archetype:model_comparison
|
|
42
|
+
/turing:try archetype:feature_sweep
|
|
43
|
+
/turing:try archetype:ensemble_construction
|
|
44
|
+
/turing:try archetype:regularization_search
|
|
45
|
+
/turing:try archetype:ablation_study
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Available Archetypes
|
|
49
|
+
|
|
50
|
+
| Archetype | What it does | Expected experiments |
|
|
51
|
+
|-----------|-------------|---------------------|
|
|
52
|
+
| `model_comparison` | Compare XGBoost, LightGBM, RF, LR, MLP with statistical tests | ~5 |
|
|
53
|
+
| `hyperparameter_sweep` | Grid search with multi-seed validation | 15-36 |
|
|
54
|
+
| `feature_sweep` | Add/remove feature transforms one at a time | 6-10 |
|
|
55
|
+
| `regularization_search` | Binary search for optimal regularization | 4-6 |
|
|
56
|
+
| `ensemble_construction` | Voting, stacking, blending of top models | 4-6 |
|
|
57
|
+
| `learning_rate_schedule` | lr vs n_estimators tradeoff | 4-5 |
|
|
58
|
+
| `data_quality_audit` | Class balance, label noise, leakage checks | 3-5 |
|
|
59
|
+
| `ablation_study` | Remove features one at a time to measure importance | N+1 |
|
|
60
|
+
|
|
61
|
+
## How It Connects
|
|
62
|
+
|
|
63
|
+
The `/turing:train` loop checks `hypotheses.yaml` during the OBSERVE step. Human-injected hypotheses (high priority) are tried before the agent generates its own. After testing, the hypothesis is marked as `tested`, `promising`, or `dead-end` with a link to the resulting experiment.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: update
|
|
3
|
+
description: Incremental model update — add new data without full retraining, with forgetting detection.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> --new-data <path> [--replay-ratio 0.1] [--tolerance 0.005]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Add new data to an existing model without starting from scratch. Detects catastrophic forgetting.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/incremental_update.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/updates/`
|
|
15
|
+
|
|
16
|
+
## Model-specific strategies
|
|
17
|
+
- **XGBoost/LightGBM:** continued boosting with additional rounds
|
|
18
|
+
- **Neural networks:** fine-tune with reduced LR + replay buffer from old data
|
|
19
|
+
- **scikit-learn:** partial_fit() or warm_start=True
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:update exp-089 --new-data data/new_batch.csv
|
|
24
|
+
/turing:update exp-089 --new-data data/new.csv --replay-ratio 0.2
|
|
25
|
+
/turing:update exp-089 --new-data data/new.csv --tolerance 0.01
|
|
26
|
+
/turing:update exp-089 --new-data data/new.csv --json
|
|
27
|
+
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: validate
|
|
3
|
+
description: Run stability validation on the current experiment configuration. Executes N runs to measure metric variance and auto-configures multi-run evaluation if variance is too high.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--auto]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Validate the stability of the current ML pipeline by running it multiple times and measuring variance.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Run stability check:**
|
|
19
|
+
```bash
|
|
20
|
+
python scripts/validate_stability.py
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
3. **If `$ARGUMENTS` contains `--auto`:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/validate_stability.py --auto
|
|
26
|
+
```
|
|
27
|
+
This auto-writes `evaluation.n_runs: 3` to `config.yaml` if CV >= 5% (the instability threshold used in the report below).
|
|
28
|
+
|
|
29
|
+
4. **Report results:**
|
|
30
|
+
- **Stable (CV < 5%):** metric is reliable, single-run evaluation is sufficient
|
|
31
|
+
- **Unstable (CV >= 5%):** metric has high variance, multi-run with median is recommended
|
|
32
|
+
- If `--auto` was used, report what was changed in config.yaml
|
|
33
|
+
|
|
34
|
+
5. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: warm
|
|
3
|
+
description: Warm-start from a prior model — load checkpoint, optionally freeze layers, adjust learning rate, and continue training.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> [--freeze-layers encoder] [--unfreeze-after 5]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Take a trained checkpoint and use it as initialization for a new experiment. Automates the "start from here but change X" pattern.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument is the source experiment ID (required)
|
|
20
|
+
- `--freeze-layers encoder decoder` — layer names to freeze (neural only)
|
|
21
|
+
- `--unfreeze-after 5` — unfreeze all layers after N epochs (gradual unfreezing)
|
|
22
|
+
- `--lr-factor 0.1` — learning rate reduction factor (default: 0.1x)
|
|
23
|
+
- `--json` — raw JSON output
|
|
24
|
+
|
|
25
|
+
3. **Run warm-start planner:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/warm_start.py $ARGUMENTS
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **Report results:**
|
|
31
|
+
- Model type detection (tree, neural, sklearn)
|
|
32
|
+
- Strategy: continue_boosting, load_weights, or warm_start_param
|
|
33
|
+
- Numbered step-by-step instructions
|
|
34
|
+
- Config changes to apply
|
|
35
|
+
- Checkpoint info (path, format, size)
|
|
36
|
+
|
|
37
|
+
5. **Strategies by model type:**
|
|
38
|
+
- **Tree models (XGBoost/LightGBM):** continue boosting from existing trees with more estimators
|
|
39
|
+
- **Neural networks:** load weights, optionally freeze layers, reset optimizer, reduce LR
|
|
40
|
+
- **scikit-learn:** use `warm_start=True` parameter for incremental learning
|
|
41
|
+
|
|
42
|
+
6. **If no checkpoint found:** plan is still generated, but warns that checkpoint is needed
|
|
43
|
+
|
|
44
|
+
7. **Saved output:** report written to `experiments/warm_starts/warm-<exp-id>.yaml`
|
|
45
|
+
|
|
46
|
+
## Examples
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
/turing:warm exp-042 # Auto-detect strategy
|
|
50
|
+
/turing:warm exp-042 --freeze-layers encoder # Freeze encoder layers
|
|
51
|
+
/turing:warm exp-042 --freeze-layers encoder --unfreeze-after 5 # Gradual unfreezing
|
|
52
|
+
/turing:warm exp-042 --lr-factor 0.01 # Very small fine-tuning LR
|
|
53
|
+
```
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: watch
|
|
3
|
+
description: Live training monitor with early-warning alerts for loss spikes, NaN, overfitting, and metric plateaus.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--alerts] [--interval 10] [--analyze run.log]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Stream metrics during training with early-warning alerts. Catches problems mid-run instead of at the end.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--analyze run.log` — post-hoc analysis of a completed log (non-blocking)
|
|
20
|
+
- `--alerts` — show only alert lines, suppress normal output
|
|
21
|
+
- `--interval 10` — check interval in seconds (default: 10)
|
|
22
|
+
- `--alerts-config config/watch_alerts.yaml` — custom alert rules
|
|
23
|
+
- `--json` — raw JSON output (for `--analyze` mode)
|
|
24
|
+
|
|
25
|
+
3. **For post-hoc analysis:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/training_monitor.py --analyze run.log
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **For live monitoring (inform user):**
|
|
31
|
+
Live monitoring requires a running training process. Suggest the user run in a separate terminal:
|
|
32
|
+
```bash
|
|
33
|
+
python scripts/training_monitor.py --log run.log --interval 10
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
5. **Alert types:**
|
|
37
|
+
- **Loss spike:** loss > 3x rolling mean (configurable multiplier)
|
|
38
|
+
- **NaN detected:** any metric is NaN — CRITICAL, suggests pausing
|
|
39
|
+
- **Overfitting onset:** train/val gap widening for 3+ consecutive epochs
|
|
40
|
+
- **Plateau:** metric improvement < 0.001 for 5+ consecutive epochs
|
|
41
|
+
|
|
42
|
+
6. **Dashboard line format:**
|
|
43
|
+
```
|
|
44
|
+
Epoch 23/100 | loss: 0.342 ↓ | acc: 0.865 ↑ | gap: 0.018 | ⚠ plateau
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
7. **Alert config:** rules are in `config/watch_alerts.yaml` — users can customize thresholds.
|
|
48
|
+
|
|
49
|
+
8. **Saved output:** analysis report written to `experiments/monitors/analysis-*.yaml`
|
|
50
|
+
|
|
51
|
+
9. **If no training log exists:** suggest running `/turing:train` first.
|
|
52
|
+
|
|
53
|
+
## Examples
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
/turing:watch --analyze run.log # Analyze completed training
|
|
57
|
+
/turing:watch --analyze run.log --json # JSON output for scripting
|
|
58
|
+
/turing:watch --alerts # Live: show only alerts
|
|
59
|
+
/turing:watch --interval 5 # Live: check every 5 seconds
|
|
60
|
+
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: whatif
|
|
3
|
+
description: What-if analysis — answer hypotheticals from existing experiment data without running new experiments.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "\"<question>\" [--json]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Answer "what if?" questions using existing experiment data. Routes to the right estimator automatically.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/whatif_engine.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/whatif/`
|
|
15
|
+
|
|
16
|
+
## Supported question types
|
|
17
|
+
- **Data scaling:** "what if I had 2x more data" → scaling law extrapolation
|
|
18
|
+
- **Ablation:** "what if I removed class 3" → ablation study data
|
|
19
|
+
- **Pipeline stitch:** "what if I combined exp-031 with exp-042" → stitch estimation
|
|
20
|
+
- **Hyperparameters:** "what if learning_rate was 0.01" → sensitivity interpolation
|
|
21
|
+
- **Ensemble:** "what if I ensembled the top models" → correlation analysis
|
|
22
|
+
- **Pruning:** "what if I pruned to 50% sparsity" → pruning sweep interpolation
|
|
23
|
+
- **Budget:** "what if I spent my budget on X vs Y" → budget allocation
|
|
24
|
+
|
|
25
|
+
## Examples
|
|
26
|
+
```
|
|
27
|
+
/turing:whatif "what if I had 2x more data"
|
|
28
|
+
/turing:whatif "what if I removed class 3"
|
|
29
|
+
/turing:whatif "what if I combined exp-031 with exp-042"
|
|
30
|
+
/turing:whatif "what if learning_rate was 0.01" --json
|
|
31
|
+
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: xray
|
|
3
|
+
description: Internal model diagnostics — gradient flow, dead neurons, activation stats, weight distributions, tree depth analysis.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--layer encoder.layer.2] [--compare exp-a exp-b]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
See inside the model. When it underperforms, the fix depends on *why*.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- Optional experiment ID
|
|
20
|
+
- `--layer "name"` — focus on specific layer
|
|
21
|
+
- `--compare exp-a exp-b` — side-by-side diagnostics
|
|
22
|
+
- `--json` — raw JSON output
|
|
23
|
+
|
|
24
|
+
3. **Run model diagnostics:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/model_xray.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Diagnostics by model type:**
|
|
30
|
+
- **Neural networks:** gradient magnitudes, activation stats, dead neuron %, weight distributions, gradient-to-weight ratio
|
|
31
|
+
- **Tree models:** depth utilization, leaf purity, feature split dominance
|
|
32
|
+
- **scikit-learn:** coefficient magnitudes, feature importance concentration
|
|
33
|
+
|
|
34
|
+
5. **Issues detected:** dead gradients, vanishing/exploding gradients, dead neurons, sparse weights, feature dominance, overfitting risk
|
|
35
|
+
|
|
36
|
+
6. **Saved output:** report in `experiments/xrays/<exp-id>-xray.yaml`
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:xray exp-042 # Full diagnostics
|
|
42
|
+
/turing:xray # Best experiment
|
|
43
|
+
```
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import { readFile } from 'fs/promises';
|
|
2
|
+
import { dirname, join } from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
import YAML from 'yaml';
|
|
5
|
+
|
|
6
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
7
|
+
const PLUGIN_ROOT = dirname(__dirname);
|
|
8
|
+
const REGISTRY_PATH = join(PLUGIN_ROOT, 'config', 'commands.yaml');
|
|
9
|
+
|
|
10
|
+
const COMMAND_NAME_PATTERN = /^[a-z][a-z0-9-]*$/;
|
|
11
|
+
const INVOCATION_MODES = new Set(['slash_only']);
|
|
12
|
+
const MODEL_INVOCATIONS = new Set(['disabled', 'enabled']);
|
|
13
|
+
const SCRIPT_LOCATIONS = new Set(['repo', 'scaffold']);
|
|
14
|
+
|
|
15
|
+
function isRecord(value) {
|
|
16
|
+
return value !== null && typeof value === 'object' && !Array.isArray(value);
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
function requireRecord(value, label) {
|
|
20
|
+
if (!isRecord(value)) {
|
|
21
|
+
throw new Error(`${label} must be a mapping`);
|
|
22
|
+
}
|
|
23
|
+
return value;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
function requireNonEmptyString(value, label) {
|
|
27
|
+
if (typeof value !== 'string' || value.length === 0) {
|
|
28
|
+
throw new Error(`${label} must be a non-empty string`);
|
|
29
|
+
}
|
|
30
|
+
return value;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
function requireNonEmptyStringList(value, label) {
|
|
34
|
+
if (!Array.isArray(value) || value.length === 0) {
|
|
35
|
+
throw new Error(`${label} must be a non-empty string list`);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
for (const [index, item] of value.entries()) {
|
|
39
|
+
requireNonEmptyString(item, `${label}[${index}]`);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
return value;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
function requireEnum(value, allowed, label) {
|
|
46
|
+
requireNonEmptyString(value, label);
|
|
47
|
+
if (!allowed.has(value)) {
|
|
48
|
+
throw new Error(`${label} must be one of: ${Array.from(allowed).join(', ')}`);
|
|
49
|
+
}
|
|
50
|
+
return value;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
function validateEquivalentScript(value, commandName) {
|
|
54
|
+
const label = `commands.${commandName}.equivalent_script`;
|
|
55
|
+
const script = requireRecord(value, label);
|
|
56
|
+
const keys = Object.keys(script).sort();
|
|
57
|
+
const expectedKeys = ['location', 'path'];
|
|
58
|
+
if (keys.length !== expectedKeys.length || keys.some((key, index) => key !== expectedKeys[index])) {
|
|
59
|
+
throw new Error(`${label} must contain exactly: ${expectedKeys.join(', ')}`);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
requireNonEmptyString(script.path, `${label}.path`);
|
|
63
|
+
requireEnum(
|
|
64
|
+
script.location,
|
|
65
|
+
SCRIPT_LOCATIONS,
|
|
66
|
+
`${label}.location`,
|
|
67
|
+
);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
function validateCommand(commandName, value) {
|
|
71
|
+
if (!COMMAND_NAME_PATTERN.test(commandName)) {
|
|
72
|
+
throw new Error(`Invalid command name: ${commandName}`);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
const command = requireRecord(value, `commands.${commandName}`);
|
|
76
|
+
requireNonEmptyString(command.description, `commands.${commandName}.description`);
|
|
77
|
+
requireNonEmptyString(command.lifecycle, `commands.${commandName}.lifecycle`);
|
|
78
|
+
requireEnum(command.invocation_mode, INVOCATION_MODES, `commands.${commandName}.invocation_mode`);
|
|
79
|
+
requireEnum(
|
|
80
|
+
command.model_invocation,
|
|
81
|
+
MODEL_INVOCATIONS,
|
|
82
|
+
`commands.${commandName}.model_invocation`,
|
|
83
|
+
);
|
|
84
|
+
|
|
85
|
+
if (typeof command.mutates_project !== 'boolean') {
|
|
86
|
+
throw new Error(`commands.${commandName}.mutates_project must be a boolean`);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
requireNonEmptyStringList(command.tools, `commands.${commandName}.tools`);
|
|
90
|
+
|
|
91
|
+
if ('argument_hint' in command) {
|
|
92
|
+
requireNonEmptyString(command.argument_hint, `commands.${commandName}.argument_hint`);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if ('equivalent_script' in command) {
|
|
96
|
+
validateEquivalentScript(command.equivalent_script, commandName);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
function validateRegistry(value) {
|
|
101
|
+
const registry = requireRecord(value, 'Command registry root');
|
|
102
|
+
const configFiles = requireNonEmptyStringList(registry.config_files, 'config_files');
|
|
103
|
+
const commands = requireRecord(registry.commands, 'commands');
|
|
104
|
+
|
|
105
|
+
for (const [commandName, command] of Object.entries(commands)) {
|
|
106
|
+
validateCommand(commandName, command);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return {
|
|
110
|
+
commands,
|
|
111
|
+
commandNames: Object.keys(commands).sort(),
|
|
112
|
+
configFiles: [...configFiles].sort(),
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
export async function loadCommandRegistry(registryPath = REGISTRY_PATH) {
|
|
117
|
+
let source;
|
|
118
|
+
try {
|
|
119
|
+
source = await readFile(registryPath, 'utf8');
|
|
120
|
+
} catch (error) {
|
|
121
|
+
throw new Error(`Failed to read command registry at ${registryPath}: ${error.message}`);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
let parsed;
|
|
125
|
+
try {
|
|
126
|
+
parsed = YAML.parse(source);
|
|
127
|
+
} catch (error) {
|
|
128
|
+
throw new Error(`Failed to parse command registry at ${registryPath}: ${error.message}`);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
try {
|
|
132
|
+
return validateRegistry(parsed);
|
|
133
|
+
} catch (error) {
|
|
134
|
+
throw new Error(`Invalid command registry at ${registryPath}: ${error.message}`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
export async function getCommandNames(registryPath) {
|
|
139
|
+
const registry = await loadCommandRegistry(registryPath);
|
|
140
|
+
return registry.commandNames;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
export async function getExpectedCommandPaths(registryPath) {
|
|
144
|
+
const names = await getCommandNames(registryPath);
|
|
145
|
+
return ['SKILL.md', ...names.map((name) => `${name}/SKILL.md`)];
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
export async function getExpectedSkillSourcePaths(registryPath) {
|
|
149
|
+
const names = await getCommandNames(registryPath);
|
|
150
|
+
return [
|
|
151
|
+
'skills/turing/SKILL.md',
|
|
152
|
+
...names.map((name) => `skills/turing/${name}/SKILL.md`),
|
|
153
|
+
'skills/turing/rules/loop-protocol.md',
|
|
154
|
+
];
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
export async function getConfigFiles(registryPath) {
|
|
158
|
+
const registry = await loadCommandRegistry(registryPath);
|
|
159
|
+
return registry.configFiles;
|
|
160
|
+
}
|