claude-turing 1.0.1 → 1.2.0

Files changed (33)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +66 -3
  3. package/commands/card.md +36 -0
  4. package/commands/explore.md +107 -0
  5. package/commands/suggest.md +68 -4
  6. package/commands/turing.md +4 -0
  7. package/package.json +1 -1
  8. package/src/claude-md.js +1 -0
  9. package/src/install.js +2 -2
  10. package/src/verify.js +2 -0
  11. package/templates/requirements.txt +4 -0
  12. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  13. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  14. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +58 -3
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/manage_hypotheses.py +2 -2
  26. package/templates/scripts/plot_trajectory.py +611 -0
  27. package/templates/scripts/scaffold.py +8 -0
  28. package/templates/scripts/show_metrics.py +23 -2
  29. package/templates/scripts/treequest_suggest.py +520 -0
  30. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  31. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  32. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  33. package/templates/tests/test_cost_frontier.py +222 -0
package/templates/scripts/generate_model_card.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""Model card generator for the autoresearch pipeline.
+
+Produces a standardized model card markdown document inspired by
+Hugging Face model cards and Google's Model Cards for Model Reporting.
+Consolidates information from the experiment log, model contract,
+and project config into a single documentation artifact.
+
+Usage:
+    python scripts/generate_model_card.py [--config config.yaml] [--log experiments/log.jsonl] [--output MODEL_CARD.md]
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+from scripts.turing_io import load_config, load_experiments
+
+
+def load_best_experiment(
+    log_path: str,
+    metric: str,
+    lower_is_better: bool,
+) -> dict | None:
+    """Find the current best kept experiment.
+
+    Scans all experiments in the log and returns the one with
+    the best value for the given metric among 'kept' entries.
+
+    Args:
+        log_path: Path to the experiments JSONL file.
+        metric: Primary metric name to optimize.
+        lower_is_better: True for metrics like MAE/MSE, False for accuracy/F1.
+
+    Returns:
+        Best experiment dict, or None if no kept entries exist.
+    """
+    experiments = load_experiments(log_path)
+    best = None
+    best_val = float("inf") if lower_is_better else float("-inf")
+
+    for exp in experiments:
+        if exp.get("status") != "kept":
+            continue
+        val = exp.get("metrics", {}).get(metric)
+        if val is None:
+            continue
+        if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+            best_val = val
+            best = exp
+
+    return best
+
+
+def load_model_contract(contract_path: str) -> dict:
+    """Read model_contract.md and extract structured fields.
+
+    Parses the contract markdown to pull out the contract version
+    and bundle format description.
+
+    Args:
+        contract_path: Path to model_contract.md.
+
+    Returns:
+        Dict with 'version', 'bundle_format', and 'raw' keys.
+        Returns defaults if the file does not exist.
+    """
+    path = Path(contract_path)
+    if not path.exists():
+        return {"version": "1", "bundle_format": "joblib bundle", "raw": ""}
+
+    text = path.read_text(encoding="utf-8")
+
+    # Extract version
+    version = "1"
+    version_match = re.search(r"Version:\s*(\S+)", text)
+    if version_match:
+        version = version_match.group(1)
+
+    # Extract bundle format from the heading or description
+    bundle_format = "joblib bundle"
+    if "joblib" in text.lower():
+        bundle_format = "joblib bundle (model + featurizer + config)"
+    elif "onnx" in text.lower():
+        bundle_format = "ONNX"
+    elif "pickle" in text.lower():
+        bundle_format = "pickle"
+
+    return {"version": version, "bundle_format": bundle_format, "raw": text}
+
+
+def generate_card(
+    config_path: str = "config.yaml",
+    log_path: str = "experiments/log.jsonl",
+    contract_path: str = "model_contract.md",
+    output_path: str | None = None,
+) -> str:
+    """Produce a model card markdown document.
+
+    Combines information from the project config, experiment log,
+    and model contract into a standardized model card.
+
+    Args:
+        config_path: Path to config.yaml.
+        log_path: Path to experiments/log.jsonl.
+        contract_path: Path to model_contract.md.
+        output_path: If given, write the card to this file.
+
+    Returns:
+        The model card as a markdown string.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+    direction = "lower" if lower_is_better else "higher"
+
+    data_cfg = config.get("data", {})
+    model_cfg = config.get("model", {})
+
+    experiments = load_experiments(log_path)
+    best = load_best_experiment(log_path, metric, lower_is_better)
+    contract = load_model_contract(contract_path)
+
+    # Derive project name from config or directory
+    project_name = config.get("project_name", Path(".").resolve().name)
+
+    # Compute experiment stats
+    total_experiments = len(experiments)
+    kept_experiments = sum(1 for e in experiments if e.get("status") == "kept")
+
+    # Determine convergence status
+    convergence_cfg = config.get("convergence", {})
+    patience = convergence_cfg.get("patience", 3)
+    converged = "not converged"
+    if kept_experiments > 0:
+        # Check if last N kept experiments showed no improvement
+        kept_exps = [e for e in experiments if e.get("status") == "kept"]
+        if len(kept_exps) >= patience:
+            recent_vals = [
+                e.get("metrics", {}).get(metric)
+                for e in kept_exps[-patience:]
+                if e.get("metrics", {}).get(metric) is not None
+            ]
+            if len(recent_vals) == patience:
+                if lower_is_better:
+                    converged = "converged" if min(recent_vals) == recent_vals[0] else "not converged"
+                else:
+                    converged = "converged" if max(recent_vals) == recent_vals[0] else "not converged"
+
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+    lines = [
+        f"# Model Card: {project_name}",
+        "",
+        f"Generated: {now}",
+        "",
+    ]
+
+    # --- Model Details ---
+    lines.extend([
+        "## Model Details",
+        "",
+    ])
+    if best:
+        best_config = best.get("config", {})
+        model_type = best_config.get("model_type", model_cfg.get("type", "unknown"))
+        framework = _infer_framework(model_type)
+        training_time = best.get("metrics", {}).get("training_time", "N/A")
+        lines.extend([
+            f"- **Model type:** {model_type}",
+            f"- **Task:** {config.get('task_description', eval_cfg.get('primary_metric', 'N/A'))}",
+            f"- **Primary metric:** {metric} ({direction} is better)",
+            f"- **Framework:** {framework}",
+            f"- **Training time:** {training_time}s" if training_time != "N/A" else f"- **Training time:** {training_time}",
+            f"- **Artifact:** models/best/model.joblib",
+        ])
+    else:
+        lines.extend([
+            f"- **Model type:** {model_cfg.get('type', 'unknown')}",
+            f"- **Task:** {config.get('task_description', 'N/A')}",
+            f"- **Primary metric:** {metric} ({direction} is better)",
+            "- **Framework:** N/A",
+            "- **Training time:** N/A",
+            "- **Artifact:** N/A (no trained model yet)",
+        ])
+
+    # --- Training Data ---
+    lines.extend([
+        "",
+        "## Training Data",
+        "",
+        f"- **Source:** {data_cfg.get('source', 'N/A')}",
+    ])
+    split_ratios = data_cfg.get("split_ratios", {})
+    if split_ratios:
+        splits_str = ", ".join(f"{k}: {v}" for k, v in split_ratios.items())
+        lines.append(f"- **Split ratios:** {splits_str}")
+    # Samples from experiment metadata if available
+    if best and best.get("metrics", {}).get("n_samples"):
+        lines.append(f"- **Samples:** {best['metrics']['n_samples']}")
+
+    # --- Performance ---
+    lines.extend([
+        "",
+        "## Performance",
+        "",
+    ])
+    if best:
+        lines.append("| Metric | Value |")
+        lines.append("|--------|-------|")
+        for m, v in best.get("metrics", {}).items():
+            if m in ("training_time", "n_samples"):
+                continue  # Skip non-metric fields
+            if isinstance(v, float):
+                lines.append(f"| {m} | {v:.4f} |")
+            else:
+                lines.append(f"| {m} | {v} |")
+
+        # Per-class performance
+        per_class = best.get("metrics", {}).get("per_class")
+        if per_class:
+            lines.extend([
+                "",
+                "### Per-Class Performance",
+                "",
+            ])
+            if isinstance(per_class, dict):
+                headers = list(next(iter(per_class.values())).keys()) if per_class else []
+                if headers:
+                    lines.append("| Class | " + " | ".join(headers) + " |")
+                    lines.append("|-------|" + "|".join(["-------"] * len(headers)) + "|")
+                    for cls, vals in per_class.items():
+                        row = " | ".join(
+                            f"{vals.get(h, 'N/A'):.4f}" if isinstance(vals.get(h), float) else str(vals.get(h, "N/A"))
+                            for h in headers
+                        )
+                        lines.append(f"| {cls} | {row} |")
+    else:
+        lines.append("No experiments completed yet.")
+
+    # --- Training History ---
+    lines.extend([
+        "",
+        "## Training History",
+        "",
+        f"- **Total experiments:** {total_experiments}",
+        f"- **Experiments kept:** {kept_experiments}",
+        f"- **Best experiment:** {best.get('experiment_id', 'N/A') if best else 'N/A'}",
+        f"- **Convergence:** {converged}",
+    ])
+
+    # --- Limitations ---
+    lines.extend([
+        "",
+        "## Limitations",
+        "",
+        f"- This model was trained on {data_cfg.get('source', 'the provided dataset')} and may not generalize to other distributions",
+        "- Performance metrics are from the validation set; test set performance may differ",
+    ])
+    # Check for overfitting gap
+    if best:
+        train_metric = best.get("metrics", {}).get(f"train_{metric}")
+        val_metric = best.get("metrics", {}).get(metric)
+        if train_metric is not None and val_metric is not None:
+            gap = abs(train_metric - val_metric)
+            if gap > 0.05:
+                lines.append(f"- Train/val gap of {gap:.3f} suggests possible overfitting")
+
+    # --- Intended Use ---
+    task_desc = config.get("task_description", "N/A")
+    lines.extend([
+        "",
+        "## Intended Use",
+        "",
+        f"- {task_desc}",
+        "- Not intended for: <placeholder for user to fill>",
+    ])
+
+    # --- Ethical Considerations ---
+    lines.extend([
+        "",
+        "## Ethical Considerations",
+        "",
+        "- <placeholder -- users should document bias, fairness, and impact>",
+    ])
+
+    # --- Artifact Contract ---
+    lines.extend([
+        "",
+        "## Artifact Contract",
+        "",
+        f"- **Contract version:** {contract['version']}",
+        f"- **Bundle format:** {contract['bundle_format']}",
+        "- See `model_contract.md` for full consumer documentation",
+    ])
+
+    card = "\n".join(lines) + "\n"
+
+    if output_path:
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        Path(output_path).write_text(card, encoding="utf-8")
+
+    return card
+
+
+def _infer_framework(model_type: str) -> str:
+    """Infer the ML framework from the model type string."""
+    model_lower = model_type.lower()
+    if "xgboost" in model_lower or "xgb" in model_lower:
+        return "xgboost"
+    if "lightgbm" in model_lower or "lgb" in model_lower:
+        return "lightgbm"
+    if "catboost" in model_lower:
+        return "catboost"
+    if any(t in model_lower for t in ("random_forest", "randomforest", "logistic", "svm", "svc", "mlp", "knn")):
+        return "sklearn"
+    return model_lower
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Generate a standardized model card")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
+    parser.add_argument("--contract", default="model_contract.md", help="Path to model contract")
+    parser.add_argument("--output", default=None, help="Output path (default: print to stdout)")
+    args = parser.parse_args()
+
+    card = generate_card(args.config, args.log, args.contract, args.output)
+    if args.output:
+        print(f"Model card written to {args.output}")
+    else:
+        print(card)
+
+
+if __name__ == "__main__":
+    main()
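
For readers skimming the diff, a minimal sketch of the inputs the new script consumes may help. The field names below (status, metrics, config, experiment_id, and the evaluation/data/model config sections) are the keys generate_card actually reads; the concrete values, the demo-project name, and the data/tickets.csv path are illustrative only. The snippet assumes it runs from the root of a scaffolded claude-turing project, where config.yaml and experiments/log.jsonl are the script's default paths and scripts/turing_io.py is importable.

# Illustrative only: writes a toy config.yaml and experiments/log.jsonl with the
# keys generate_model_card.py reads, then invokes the generator from the CLI.
import json
from pathlib import Path

Path("experiments").mkdir(exist_ok=True)

Path("config.yaml").write_text(
    "project_name: demo-project\n"
    "task_description: classify support tickets\n"
    "evaluation:\n"
    "  primary_metric: f1\n"
    "  lower_is_better: false\n"
    "data:\n"
    "  source: data/tickets.csv\n"
    "  split_ratios: {train: 0.8, val: 0.2}\n"
    "model:\n"
    "  type: random_forest\n",
    encoding="utf-8",
)

entry = {
    "experiment_id": "exp-001",
    "status": "kept",  # only entries with status "kept" are considered
    "config": {"model_type": "random_forest"},
    "metrics": {"f1": 0.91, "train_f1": 0.95, "training_time": 12.3, "n_samples": 1000},
}
Path("experiments/log.jsonl").write_text(json.dumps(entry) + "\n", encoding="utf-8")

# From the project root, render the card (writes MODEL_CARD.md):
#   python scripts/generate_model_card.py --output MODEL_CARD.md

With a log like this, the generated card would report random_forest as the model type, f1 (higher is better) as the primary metric, and flag the 0.04 train/val gap only if it exceeded the script's 0.05 threshold.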