claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
package/config/defaults.yaml ADDED
@@ -0,0 +1,45 @@
+ # Turing Plugin Defaults
+ #
+ # Fallback values used when a scaffolded project's config.yaml is missing keys.
+ # These represent conservative starting points — the autoresearch agent will
+ # explore the parameter space from here.
+
+ convergence:
+   patience: 3                    # Consecutive non-improvements before stopping
+   improvement_threshold: 0.005   # 0.5% relative improvement required
+
+ evaluation:
+   primary_metric: "accuracy"
+   metrics: ["accuracy", "f1_weighted"]
+   lower_is_better: false
+
+ model:
+   type: "xgboost"
+   hyperparams:
+     n_estimators: 100
+     max_depth: 4
+     learning_rate: 0.1
+
+ data:
+   split_ratios:
+     train: 0.70
+     val: 0.15
+     test: 0.15
+   random_state: 42
+
+ # Template placeholders — resolved during /turing:init scaffolding
+ placeholders:
+   PROJECT_NAME: "Name of the ML project (e.g., sentiment, churn)"
+   TARGET_METRIC: "Primary metric to optimize (e.g., accuracy, f1, mae)"
+   TASK_DESCRIPTION: "What the model does (e.g., Predict customer churn)"
+   ML_DIR: "Directory for ML files relative to project root"
+   DATA_SOURCE: "Path to training data file"
+   METRIC_DIRECTION: "lower or higher — which direction is better"
+
+ # Agent configuration
+ agents:
+   researcher:
+     max_turns: 200
+     memory_path: ".claude/agent-memory/ml-researcher/MEMORY.md"
+   evaluator:
+     max_turns: 50
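The convergence block is what the loop's stopping rule reads: once `patience` consecutive experiments fail to beat the running best by the relative `improvement_threshold`, the loop should stop. Below is a minimal sketch of that check, assuming the metric history is already a plain Python list; the packaged scripts/check_convergence.py reads the history from the experiment log and is the authoritative implementation.

```python
def has_converged(history, patience=3, improvement_threshold=0.005,
                  lower_is_better=False):
    """Return True once `patience` consecutive experiments fail to improve
    on the running best by the relative `improvement_threshold`."""
    if not history:
        return False
    best = history[0]
    stale = 0
    for value in history[1:]:
        if lower_is_better:
            improved = value < best * (1 - improvement_threshold)
        else:
            improved = value > best * (1 + improvement_threshold)
        if improved:
            best = value
            stale = 0
        else:
            stale += 1
            if stale >= patience:
                return True
    return False

# Accuracy history: three follow-ups that never clear the 0.5% bar -> converged.
print(has_converged([0.81, 0.812, 0.813, 0.809]))  # True with patience=3
```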
package/config/experiment_archetypes.yaml ADDED
@@ -0,0 +1,127 @@
+ # Structured Experiment Archetypes
+ #
+ # Pre-defined experiment strategies that give the agent (and human)
+ # structured starting points instead of ad-hoc free text.
+ #
+ # Usage:
+ #   /turing:try archetype:model_comparison
+ #   /turing:try archetype:feature_sweep
+ #   python scripts/manage_hypotheses.py add --archetype model_comparison
+ #
+ # Each archetype defines: what to do, in what order, and what to measure.
+ # The agent fills in project-specific details (metric, model type, etc.)
+ # from config.yaml at execution time.
+
+ archetypes:
+   model_comparison:
+     name: "Systematic Model Comparison"
+     description: "Compare multiple model families with identical preprocessing to find the best architecture for this task"
+     steps:
+       - "Train all candidate models with identical data splits and preprocessing"
+       - "Use cross-validation (5-fold) for each model"
+       - "Compare on primary metric + secondary metrics"
+       - "Run statistical comparison (paired t-test or Wilcoxon) between top 2"
+       - "Report: comparison table, statistical significance, recommendation"
+     suggested_models:
+       - "xgboost"
+       - "lightgbm"
+       - "random_forest"
+       - "logistic_regression"
+       - "mlp"
+     when_to_use: "Early in a project when you haven't established which model family works best"
+     expected_experiments: 5
+     family_tag: "model-comparison"
+
+   hyperparameter_sweep:
+     name: "Hyperparameter Grid Sweep"
+     description: "Systematically explore the hyperparameter space of the current best model"
+     steps:
+       - "Define parameter ranges based on current best config"
+       - "Generate cartesian product queue via sweep_config.yaml"
+       - "Run all combinations sequentially"
+       - "Identify top 3 configurations"
+       - "Run statistical validation on top 3 (multi-seed)"
+     when_to_use: "After selecting a model family, before fine-tuning"
+     expected_experiments: 15-36
+     family_tag: "hyperparameter-sweep"
+
+   feature_sweep:
+     name: "Feature Engineering Exploration"
+     description: "Systematically add and evaluate feature transformations one at a time"
+     steps:
+       - "Establish baseline with current features"
+       - "Add polynomial interaction features — evaluate"
+       - "Add target encoding for high-cardinality categoricals — evaluate"
+       - "Add binning for continuous features — evaluate"
+       - "Remove low-importance features (bottom 20%) — evaluate"
+       - "Combine the improvements that helped independently"
+     when_to_use: "When model architecture is settled but performance plateaus"
+     expected_experiments: 6-10
+     family_tag: "feature-engineering"
+
+   regularization_search:
+     name: "Regularization Optimization"
+     description: "Find the optimal regularization strength to minimize overfitting"
+     steps:
+       - "Measure current train/val gap (overfit_gap metric)"
+       - "If gap > 0.05: increase regularization (max_depth down, min_child_weight up, dropout up)"
+       - "If gap < 0.01: decrease regularization (model may be underfitting)"
+       - "Binary search: try 3-4 regularization levels"
+       - "Plot train/val gap vs regularization strength"
+       - "Select the configuration with smallest gap that doesn't sacrifice val performance"
+     when_to_use: "When train metrics are much better than val metrics (overfit_gap > 0.05)"
+     expected_experiments: 4-6
+     family_tag: "regularization"
+
+   ensemble_construction:
+     name: "Ensemble Model Construction"
+     description: "Combine top-performing models into an ensemble for improved stability and accuracy"
+     steps:
+       - "Select top 3 diverse models (different families preferred)"
+       - "Try soft voting ensemble — evaluate"
+       - "Try stacking with a simple meta-learner (logistic regression) — evaluate"
+       - "Try blending (average predictions on holdout) — evaluate"
+       - "Compare ensemble vs best individual model"
+       - "Run multi-seed validation on the best ensemble"
+     when_to_use: "Late-stage optimization when individual models have plateaued"
+     expected_experiments: 4-6
+     family_tag: "ensemble"
+
+   learning_rate_schedule:
+     name: "Learning Rate Schedule Optimization"
+     description: "Trade off learning rate against number of estimators for gradient boosting models"
+     steps:
+       - "Start with current lr and n_estimators"
+       - "Try lr/2 with 2x n_estimators"
+       - "Try lr/5 with 5x n_estimators"
+       - "Try lr/10 with 10x n_estimators"
+       - "Find the sweet spot where val performance peaks before overfitting"
+     when_to_use: "For gradient boosting models (XGBoost, LightGBM) after initial hyperparameter selection"
+     expected_experiments: 4-5
+     family_tag: "lr-schedule"
+
+   data_quality_audit:
+     name: "Data Quality Investigation"
+     description: "Investigate whether data quality issues are limiting model performance"
+     steps:
+       - "Check class balance — if imbalanced, try SMOTE or class weights"
+       - "Check for label noise — train on 90% of data, predict the held-out 10%, flag disagreements"
+       - "Check for data leakage — verify no future information in features"
+       - "Check feature distributions — identify outliers, missing value patterns"
+       - "Try robust preprocessing — winsorize outliers, impute missing values differently"
+     when_to_use: "When model performance is unexpectedly poor or unstable across seeds"
+     expected_experiments: 3-5
+     family_tag: "data-quality"
+
+   ablation_study:
+     name: "Feature Ablation Study"
+     description: "Remove features one at a time to understand their individual contribution"
+     steps:
+       - "Train baseline with all features — record metric"
+       - "For each feature group: remove it, retrain, record metric delta"
+       - "Rank features by importance (metric drop when removed)"
+       - "Identify features that hurt performance (metric improves when removed)"
+       - "Train final model without harmful features"
+     when_to_use: "When you have many features and want to understand which matter"
+     expected_experiments: "N+1 where N is the number of feature groups"
+     family_tag: "ablation"
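Because each archetype is plain data, a command such as /turing:try archetype:model_comparison only has to load this YAML and splice in project specifics from config.yaml. A rough sketch of that lookup, assuming PyYAML and the file paths shown above; load_archetype is a hypothetical helper, not part of the shipped scripts.

```python
import yaml  # PyYAML

def load_archetype(name, archetypes_path="config/experiment_archetypes.yaml",
                   config_path="config.yaml"):
    """Return an archetype dict with the project's primary metric attached."""
    with open(archetypes_path) as fh:
        archetypes = yaml.safe_load(fh)["archetypes"]
    if name not in archetypes:
        raise KeyError(f"unknown archetype {name!r}; choose from {sorted(archetypes)}")
    with open(config_path) as fh:
        project = yaml.safe_load(fh) or {}
    plan = dict(archetypes[name])
    # Fill in the project-specific detail the archetype leaves open.
    plan["primary_metric"] = project.get("evaluation", {}).get("primary_metric", "accuracy")
    return plan

plan = load_archetype("model_comparison")
for step in plan["steps"]:
    print("-", step)
```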
package/config/lifecycle.toml ADDED
@@ -0,0 +1,31 @@
+ # Experiment Lifecycle State Machine
+ #
+ # Each experiment transitions through these states. The autoresearch
+ # agent validates transitions against this data — not English prose.
+ #
+ # This is a formal encoding of the scientific method applied to ML:
+ # hypothesize -> execute -> observe -> decide -> record.
+
+ [states]
+ proposed = { description = "Hypothesis formed, changes committed, not yet run" }
+ running = { description = "Training in progress" }
+ evaluating = { description = "Training complete, metrics being computed" }
+ kept = { description = "Improvement confirmed, merged to main" }
+ discarded = { description = "No improvement, reverted" }
+ converged = { description = "Training loop terminated — convergence detected" }
+
+ [transitions]
+ # from -> [valid next states]
+ proposed = ["running"]
+ running = ["evaluating", "discarded"]   # discarded if training crashes
+ evaluating = ["kept", "discarded"]
+ kept = ["proposed"]        # next iteration starts new hypothesis
+ discarded = ["proposed"]   # next iteration starts new hypothesis
+
+ [requirements]
+ # Preconditions for each transition
+ "proposed -> running" = "Changes committed to git, venv activated"
+ "running -> evaluating" = "train.py completed without error, run.log exists"
+ "evaluating -> kept" = "Primary metric improved over prior best"
+ "evaluating -> discarded" = "Primary metric did not improve"
+ "running -> discarded" = "Training crashed or timed out"
package/config/novelty_aliases.yaml ADDED
@@ -0,0 +1,107 @@
+ # Novelty Guard Alias Configuration
+ #
+ # Configurable per-project. The novelty guard normalizes experiment
+ # descriptions using these tables before comparing similarity.
+ # Extend with domain-specific aliases for your ML task.
+
+ phrase_aliases:
+   "learning rate": "lr"
+   "step size": "lr"
+   "batch size": "batch_size"
+   "gradient accumulation": "grad_accum"
+   "weight decay": "weight_decay"
+   "early stopping": "early_stop"
+   "random forest": "rf"
+   "gradient boosting": "gbdt"
+   "neural network": "nn"
+   "neural net": "nn"
+   "decision tree": "dtree"
+   "feature engineering": "feat_eng"
+   "feature selection": "feat_sel"
+   "hyperparameter tuning": "hparam_tune"
+   "cross validation": "cv"
+   "train test split": "split"
+   "one hot encoding": "onehot"
+   "label encoding": "label_enc"
+
+ token_aliases:
+   "increase": "up"
+   "increasing": "up"
+   "raise": "up"
+   "higher": "up"
+   "boost": "up"
+   "larger": "up"
+   "bigger": "up"
+   "more": "up"
+   "decrease": "down"
+   "decreasing": "down"
+   "lower": "down"
+   "reduce": "down"
+   "smaller": "down"
+   "fewer": "down"
+   "less": "down"
+   "remove": "drop"
+   "delete": "drop"
+   "disable": "drop"
+   "add": "include"
+   "enable": "include"
+   "introduce": "include"
+   "switch": "change"
+   "replace": "change"
+   "swap": "change"
+
+ stopwords:
+   - "a"
+   - "an"
+   - "and"
+   - "are"
+   - "as"
+   - "at"
+   - "be"
+   - "by"
+   - "for"
+   - "from"
+   - "if"
+   - "in"
+   - "into"
+   - "is"
+   - "it"
+   - "of"
+   - "on"
+   - "or"
+   - "the"
+   - "to"
+   - "try"
+   - "use"
+   - "using"
+   - "with"
+
+ concept_patterns:
+   lr: ["lr", "learning_rate", "step_size"]
+   architecture: ["depth", "width", "layers", "heads", "nn", "rf", "gbdt", "dtree", "xgboost", "lightgbm"]
+   regularization: ["dropout", "weight_decay", "l1", "l2", "early_stop", "regulariz"]
+   features: ["feat_eng", "feat_sel", "onehot", "label_enc", "feature", "column"]
+   data: ["split", "augment", "sample", "oversample", "undersample", "cv"]
+   optimizer: ["adam", "sgd", "optim", "momentum", "scheduler"]
+   ensemble: ["voting", "stacking", "blending", "bagging", "boosting"]
+
+ # Mode policies: what each research mode allows/blocks
+ mode_policies:
+   explore:
+     novel: "allow"
+     known_success: "block"
+     incremental_followup: "block"
+     repeat_failure: "block"
+     duplicate_run: "block"
+   exploit:
+     novel: "caution"
+     known_success: "allow"
+     incremental_followup: "allow"
+     repeat_failure: "block"
+     duplicate_run: "block"
+   replicate:
+     novel: "block"
+     known_success: "allow"
+     incremental_followup: "allow"
+     repeat_failure: "caution"
+     duplicate_run: "allow"
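The point of these alias tables is that "raise the learning rate" and "increase lr" should collide before any similarity score is computed. A condensed sketch of that normalization pass, assuming PyYAML and that phrase aliases are applied before tokenization; the shipped scripts/novelty_guard.py is the authoritative version.

```python
import re
import yaml  # PyYAML

def normalize(description, aliases_path="config/novelty_aliases.yaml"):
    """Lowercase, apply phrase/token aliases, drop stopwords, return a token set."""
    with open(aliases_path) as fh:
        cfg = yaml.safe_load(fh)
    text = description.lower()
    for phrase, alias in cfg["phrase_aliases"].items():
        text = text.replace(phrase, alias)
    tokens = re.findall(r"[a-z0-9_.]+", text)
    tokens = [cfg["token_aliases"].get(t, t) for t in tokens]
    stop = set(cfg["stopwords"])
    return {t for t in tokens if t not in stop}

a = normalize("Try increasing the learning rate")
b = normalize("raise lr")
print(a == b)  # True: both normalize to {"up", "lr"}
```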
package/config/relationships.toml ADDED
@@ -0,0 +1,125 @@
+ # ADR Relationship Graph
+ #
+ # Encodes dependencies and relationships between ADRs.
+ # Used by /blueprint:impact for incremental conflict detection.
+
+ [nodes]
+ "ADR-0001" = { title = "Use ADRs for architectural decisions", status = "Accepted" }
+ "ADR-0002" = { title = "Separate hypothesis space from measurement apparatus", status = "Accepted" }
+ "ADR-0003" = { title = "Two-agent architecture with least-privilege boundaries", status = "Accepted" }
+ "ADR-0004" = { title = "TOML config DSL for domain knowledge", status = "Accepted" }
+ "ADR-0005" = { title = "Git-disciplined experiment lifecycle", status = "Accepted" }
+ "ADR-0006" = { title = "Patience-based convergence detection", status = "Accepted" }
+ "ADR-0007" = { title = "JSONL append-only experiment logging", status = "Accepted" }
+ "ADR-0008" = { title = "Template-based project scaffolding", status = "Accepted" }
+ "ADR-0009" = { title = "XGBoost default with pluggable featurizers", status = "Accepted" }
+ "ADR-0010" = { title = "Claude Code plugin distribution", status = "Accepted" }
+
+ [[edges]]
+ from = "ADR-0003"
+ to = "ADR-0002"
+ type = "implements"
+ description = "Agent capability boundary is the agent-level implementation of hypothesis-measurement separation"
+
+ [[edges]]
+ from = "ADR-0006"
+ to = "ADR-0007"
+ type = "depends-on"
+ description = "Convergence detection reads the JSONL experiment log to count non-improvements"
+
+ [[edges]]
+ from = "ADR-0005"
+ to = "ADR-0002"
+ type = "supports"
+ description = "Git discipline (commit before, revert on failure) preserves the measurement-hypothesis boundary across experiments"
+
+ [[edges]]
+ from = "ADR-0008"
+ to = "ADR-0002"
+ type = "implements"
+ description = "Template headers label files as MEASUREMENT APPARATUS or HYPOTHESIS SPACE"
+
+ [[edges]]
+ from = "ADR-0008"
+ to = "ADR-0009"
+ type = "depends-on"
+ description = "Templates include the default XGBoost training pipeline"
+
+ [[edges]]
+ from = "ADR-0010"
+ to = "ADR-0008"
+ type = "depends-on"
+ description = "npm installer deploys the templates directory"
+
+ [[edges]]
+ from = "ADR-0009"
+ to = "ADR-0002"
+ type = "supports"
+ description = "Featurizer pipeline is READ-ONLY, agent modifies how train.py uses it"
+
+ # Proposed ADRs from architecture evaluation (2026-03-31)
+ [nodes."ADR-0011"]
+ title = "Establish testing strategy for plugin infrastructure"
+ status = "Proposed"
+
+ [nodes."ADR-0012"]
+ title = "Extract convergence detection from shell to testable Python"
+ status = "Proposed"
+
+ [nodes."ADR-0013"]
+ title = "Standardize experiment status vocabulary"
+ status = "Proposed"
+
+ [nodes."ADR-0014"]
+ title = "Enforce placeholder substitution verification"
+ status = "Proposed"
+
+ [nodes."ADR-0015"]
+ title = "Extract metric output format into documented contract"
+ status = "Proposed"
+
+ [nodes."ADR-0016"]
+ title = "Unify scaffolding into a single implementation"
+ status = "Proposed"
+
+ [[edges]]
+ from = "ADR-0011"
+ to = "ADR-0002"
+ type = "enforces"
+ description = "Testing strategy must verify hypothesis-measurement separation invariant"
+
+ [[edges]]
+ from = "ADR-0011"
+ to = "ADR-0006"
+ type = "enforces"
+ description = "Testing strategy must verify convergence detection"
+
+ [[edges]]
+ from = "ADR-0012"
+ to = "ADR-0006"
+ type = "refines"
+ description = "Extracts convergence logic from bash to testable Python module"
+
+ [[edges]]
+ from = "ADR-0013"
+ to = "ADR-0004"
+ type = "enforces"
+ description = "Standardizes runtime code to use lifecycle.toml vocabulary"
+
+ [[edges]]
+ from = "ADR-0014"
+ to = "ADR-0008"
+ type = "refines"
+ description = "Adds verification step to template scaffolding system"
+
+ [[edges]]
+ from = "ADR-0016"
+ to = "ADR-0008"
+ type = "refines"
+ description = "Merges dual scaffolding paths into single implementation"
+
+ [[edges]]
+ from = "ADR-0016"
+ to = "ADR-0014"
+ type = "depends-on"
+ description = "Unified scaffolding would integrate placeholder verification"
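With the ADR graph in data form, impact analysis reduces to collecting the reverse edges of a node. A sketch of that lookup, again assuming Python 3.11+ tomllib; the dependents helper is illustrative, and /blueprint:impact may traverse the graph differently.

```python
import tomllib
from collections import defaultdict

def dependents(adr_id, path="config/relationships.toml"):
    """Return (source ADR, edge type) pairs for every edge pointing at `adr_id`."""
    with open(path, "rb") as fh:
        graph = tomllib.load(fh)
    incoming = defaultdict(list)
    for edge in graph.get("edges", []):
        incoming[edge["to"]].append((edge["from"], edge["type"]))
    return incoming[adr_id]

# ADR-0002 (hypothesis/measurement separation) is the most depended-upon decision.
for source, kind in dependents("ADR-0002"):
    print(f"{source} --{kind}--> ADR-0002")
```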
package/config/state.toml ADDED
@@ -0,0 +1,24 @@
+ # Blueprint State — Session Memory
+ #
+ # Tracks the current state of the ADR system for contextual suggestions.
+ # Updated automatically by blueprint commands.
+
+ [paths]
+ adr_directory = "docs/adr"
+ architecture_doc = "docs/ARCHITECTURE.md"
+ project_root = "."
+
+ [timestamps]
+ last_adr_created = "2026-03-31"
+ last_audit = ""
+ last_evaluation = "2026-03-31"
+ last_retro = ""
+
+ [counts]
+ total_adrs = 16
+ accepted = 16
+ proposed = 0
+ rejected = 0
+ deferred = 0
+ deprecated = 0
+ superseded = 0
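A suggestion command only needs a quick read of this file to decide what to nudge next, for example an audit that has never been recorded. A minimal read sketch, assuming Python 3.11+ tomllib; the real blueprint commands own how this state is consumed and updated.

```python
import tomllib

with open("config/state.toml", "rb") as fh:
    state = tomllib.load(fh)

print("Total ADRs:", state["counts"]["total_adrs"])
if not state["timestamps"]["last_audit"]:
    print("No audit recorded yet; consider running one.")
```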
package/config/task_taxonomy.yaml ADDED
@@ -0,0 +1,110 @@
+ # ML Task Property Taxonomy
+ #
+ # Used by suggest_models.py to classify a task and generate
+ # targeted arXiv search queries. Adapted from AERO's
+ # ML_RESEARCH_CATEGORIES (src/aero/model_researcher/shared_defs.py).
+ #
+ # Each category has a key, a human-readable label, and a description.
+ # The LLM detects which categories apply to the user's task.
+ # Categories can be extended per-project by editing this file.
+
+ categories:
+   variable_length_sequences:
+     label: "Variable-Length Sequences"
+     description: "Data consists of sequences of varying lengths (e.g., text, sensor streams, speech)"
+     search_terms: ["sequence modeling", "variable length", "recurrent"]
+
+   fixed_channel_count:
+     label: "Fixed Channel Count"
+     description: "Inputs have a fixed number of channels or features across all samples (e.g., EEG, RGB images)"
+     search_terms: ["multi-channel", "fixed input", "feature vector"]
+
+   temporal_structure:
+     label: "Temporal Structure"
+     description: "Data has inherent time dependencies or ordering (e.g., time series, forecasting)"
+     search_terms: ["time series", "temporal", "forecasting"]
+
+   classification_objective:
+     label: "Classification"
+     description: "Task involves predicting discrete labels from data"
+     search_terms: ["classification", "categorical prediction"]
+
+   regression_objective:
+     label: "Regression"
+     description: "Task involves predicting continuous values"
+     search_terms: ["regression", "continuous prediction"]
+
+   generation_objective:
+     label: "Generation"
+     description: "Models must produce new data samples from learned distributions"
+     search_terms: ["generative model", "data generation", "synthesis"]
+
+   noise_robustness:
+     label: "Noise Robustness"
+     description: "System must perform well under noisy, incomplete, or corrupted inputs"
+     search_terms: ["noise robust", "data corruption", "missing data"]
+
+   real_time_constraint:
+     label: "Real-Time Constraint"
+     description: "Solution must operate under strict latency or streaming requirements"
+     search_terms: ["real-time", "low latency", "streaming"]
+
+   sensor_data:
+     label: "Sensor Data"
+     description: "Inputs originate from physical sensors (e.g., IoT, biomedical, accelerometers)"
+     search_terms: ["sensor data", "IoT", "signal processing"]
+
+   multimodal_data:
+     label: "Multimodal"
+     description: "Task combines multiple data types or modalities (e.g., vision + language)"
+     search_terms: ["multimodal", "multi-modal", "cross-modal"]
+
+   interpretability_required:
+     label: "Interpretability"
+     description: "Model must provide human-understandable reasoning or explanations"
+     search_terms: ["interpretable", "explainable", "XAI"]
+
+   few_shot_learning:
+     label: "Few-Shot Learning"
+     description: "System must generalize from very few labeled examples"
+     search_terms: ["few-shot", "low resource", "meta-learning"]
+
+   tabular_data:
+     label: "Tabular Data"
+     description: "Inputs are structured rows/columns (e.g., CSV, database tables)"
+     search_terms: ["tabular", "structured data", "gradient boosting"]
+
+   image_data:
+     label: "Image Data"
+     description: "Inputs are images or visual data"
+     search_terms: ["image classification", "computer vision", "convolutional"]
+
+   text_data:
+     label: "Text Data"
+     description: "Inputs are natural language text"
+     search_terms: ["NLP", "text classification", "language model"]
+
+   anomaly_detection:
+     label: "Anomaly Detection"
+     description: "Task involves identifying outliers or unusual patterns"
+     search_terms: ["anomaly detection", "outlier detection", "novelty detection"]
+
+   imbalanced_classes:
+     label: "Imbalanced Classes"
+     description: "Class distribution is heavily skewed"
+     search_terms: ["class imbalance", "oversampling", "SMOTE"]
+
+   high_dimensionality:
+     label: "High Dimensionality"
+     description: "Feature space is very large relative to sample count"
+     search_terms: ["dimensionality reduction", "feature selection", "curse of dimensionality"]
+
+   small_dataset:
+     label: "Small Dataset"
+     description: "Training data is limited (hundreds to low thousands of samples)"
+     search_terms: ["small data", "data-efficient", "transfer learning"]
+
+   large_scale:
+     label: "Large Scale"
+     description: "Dataset or model requires distributed or efficient training"
+     search_terms: ["large-scale", "distributed training", "efficient training"]
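Once the applicable categories have been detected, turning them into literature queries is a join over their search_terms lists. One plausible sketch of that step, assuming PyYAML; the arxiv_queries helper is made up for illustration, and the packaged classify/suggest scripts own the real query logic.

```python
import itertools
import yaml  # PyYAML

def arxiv_queries(detected, taxonomy_path="config/task_taxonomy.yaml"):
    """Build arXiv query strings by pairing the primary search term of each detected category."""
    with open(taxonomy_path) as fh:
        categories = yaml.safe_load(fh)["categories"]
    queries = []
    for a, b in itertools.combinations(detected, 2):
        queries.append(f'{categories[a]["search_terms"][0]} {categories[b]["search_terms"][0]}')
    # Also keep each category's own primary term as a standalone query.
    queries.extend(categories[c]["search_terms"][0] for c in detected)
    return queries

print(arxiv_queries(["tabular_data", "imbalanced_classes"]))
# ['tabular class imbalance', 'tabular', 'class imbalance']
```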
package/config/taxonomy.toml ADDED
@@ -0,0 +1,37 @@
+ # Experiment Classification Taxonomy
+ #
+ # Structured vocabulary for categorizing experiments, failure modes,
+ # and model architectures. Agents use these categories consistently
+ # across sessions — classification by data, not by English judgment.
+
+ [experiment_types]
+ hyperparameter = "Change model hyperparameters (lr, depth, n_estimators, etc.)"
+ architecture = "Change model type or structure (XGBoost -> LightGBM, add layers)"
+ feature = "Change feature engineering (add/remove/transform features)"
+ data = "Change data handling (augmentation, sampling, preprocessing)"
+ ensemble = "Combine multiple models"
+ regularization = "Change regularization strategy (dropout, weight decay, early stopping)"
+
+ [failure_modes]
+ overfitting = "Val metrics significantly worse than train metrics"
+ underfitting = "Both train and val metrics poor"
+ convergence = "Metrics plateaued — no improvement for N iterations"
+ instability = "Metrics oscillate across runs with same config"
+ data_leakage = "Suspiciously high metrics suggesting information leak"
+ resource_limit = "OOM, timeout, or disk space exhaustion"
+ numerical = "NaN/Inf in gradients, loss, or predictions"
+ implementation = "Bug in train.py causing incorrect training"
+
+ [model_families]
+ gradient_boosting = ["xgboost", "lightgbm", "catboost", "sklearn_gbdt"]
+ random_forest = ["sklearn_rf", "extra_trees"]
+ linear = ["logistic_regression", "ridge", "lasso", "elastic_net"]
+ neural_network = ["mlp", "cnn", "rnn", "transformer"]
+ ensemble = ["voting", "stacking", "blending"]
+ baseline = ["dummy", "majority_class", "mean_prediction"]
+
+ [severity_levels]
+ critical = "Experiment cannot run at all"
+ major = "Experiment runs but produces invalid/unreliable results"
+ minor = "Experiment runs correctly but with suboptimal configuration"
+ info = "Observation for future reference, no action required"
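Classifying a concrete model against this vocabulary is a reverse lookup over [model_families], not a judgment call. A small sketch, assuming Python 3.11+ tomllib; the model_family helper is illustrative rather than part of the package API.

```python
import tomllib

def model_family(model_type, path="config/taxonomy.toml"):
    """Return the family name whose member list contains `model_type`."""
    with open(path, "rb") as fh:
        taxonomy = tomllib.load(fh)
    for family, members in taxonomy["model_families"].items():
        if model_type in members:
            return family
    return "unknown"

print(model_family("lightgbm"))   # gradient_boosting
print(model_family("stacking"))   # ensemble
```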
package/package.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "name": "claude-turing",
+   "version": "1.0.0",
+   "type": "module",
+   "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
+   "bin": {
+     "turing": "./bin/cli.js",
+     "claude-turing": "./bin/cli.js"
+   },
+   "scripts": {
+     "postinstall": "node src/postinstall.js"
+   },
+   "files": [
+     "bin/",
+     "src/",
+     ".claude-plugin/",
+     "commands/",
+     "agents/",
+     "templates/",
+     "config/"
+   ],
+   "keywords": [
+     "ml",
+     "machine-learning",
+     "autoresearch",
+     "experiment-tracking",
+     "hyperparameter-tuning",
+     "autonomous-training",
+     "convergence-detection",
+     "model-evaluation",
+     "scientific-method",
+     "claude-code",
+     "plugin",
+     "ai-agents"
+   ],
+   "author": {
+     "name": "pragnition"
+   },
+   "homepage": "https://github.com/pragnition/turing",
+   "repository": {
+     "type": "git",
+     "url": "git+https://github.com/pragnition/turing.git"
+   },
+   "bugs": {
+     "url": "https://github.com/pragnition/turing/issues"
+   },
+   "license": "MIT",
+   "engines": {
+     "node": ">=18.0.0"
+   },
+   "dependencies": {
+     "commander": "^13.0.0"
+   }
+ }