claude-turing 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +66 -3
- package/commands/card.md +36 -0
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +2 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +58 -3
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +8 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Model card generator for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Produces a standardized model card markdown document inspired by
|
|
5
|
+
Hugging Face model cards and Google's Model Cards for Model Reporting.
|
|
6
|
+
Consolidates information from the experiment log, model contract,
|
|
7
|
+
and project config into a single documentation artifact.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python scripts/generate_model_card.py [--config config.yaml] [--log experiments/log.jsonl] [--output MODEL_CARD.md]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from scripts.turing_io import load_config, load_experiments
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_best_experiment(
    log_path: str,
    metric: str,
    lower_is_better: bool,
) -> dict | None:
    """Return the best 'kept' experiment from the log, or None.

    Only entries whose status is "kept" and that actually report a value
    for *metric* are considered; the winner is chosen by min/max of that
    value depending on the metric direction. On ties the earliest entry
    in the log wins (matching min/max first-occurrence semantics).

    Args:
        log_path: Path to the experiments JSONL file.
        metric: Primary metric name to optimize.
        lower_is_better: True for metrics like MAE/MSE, False for accuracy/F1.

    Returns:
        Best experiment dict, or None if no kept entries exist.
    """
    candidates = [
        exp
        for exp in load_experiments(log_path)
        if exp.get("status") == "kept"
        and exp.get("metrics", {}).get(metric) is not None
    ]
    if not candidates:
        return None

    # min for error-style metrics, max for score-style metrics.
    selector = min if lower_is_better else max
    return selector(candidates, key=lambda exp: exp["metrics"][metric])
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def load_model_contract(contract_path: str) -> dict:
    """Read model_contract.md and extract structured fields.

    Pulls the contract version (first "Version: <token>" occurrence) and a
    coarse bundle-format label out of the contract markdown.

    Args:
        contract_path: Path to model_contract.md.

    Returns:
        Dict with 'version', 'bundle_format', and 'raw' keys.
        Returns defaults if the file does not exist.
    """
    contract_file = Path(contract_path)
    if not contract_file.exists():
        # Missing contract: fall back to the default contract description.
        return {"version": "1", "bundle_format": "joblib bundle", "raw": ""}

    raw = contract_file.read_text(encoding="utf-8")
    lowered = raw.lower()

    match = re.search(r"Version:\s*(\S+)", raw)
    version = match.group(1) if match else "1"

    # Keyword sniffing for the bundle format; joblib takes precedence
    # over onnx/pickle when several keywords appear.
    if "joblib" in lowered:
        bundle_format = "joblib bundle (model + featurizer + config)"
    elif "onnx" in lowered:
        bundle_format = "ONNX"
    elif "pickle" in lowered:
        bundle_format = "pickle"
    else:
        bundle_format = "joblib bundle"

    return {"version": version, "bundle_format": bundle_format, "raw": raw}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def generate_card(
    config_path: str = "config.yaml",
    log_path: str = "experiments/log.jsonl",
    contract_path: str = "model_contract.md",
    output_path: str | None = None,
) -> str:
    """Produce a model card markdown document.

    Combines information from the project config, experiment log,
    and model contract into a standardized model card.

    Args:
        config_path: Path to config.yaml.
        log_path: Path to experiments/log.jsonl.
        contract_path: Path to model_contract.md.
        output_path: If given, write the card to this file.

    Returns:
        The model card as a markdown string.
    """
    # Pull metric name/direction from the evaluation section of the config.
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)
    direction = "lower" if lower_is_better else "higher"

    data_cfg = config.get("data", {})
    model_cfg = config.get("model", {})

    experiments = load_experiments(log_path)
    best = load_best_experiment(log_path, metric, lower_is_better)
    contract = load_model_contract(contract_path)

    # Derive project name from config or directory
    project_name = config.get("project_name", Path(".").resolve().name)

    # Compute experiment stats
    total_experiments = len(experiments)
    kept_experiments = sum(1 for e in experiments if e.get("status") == "kept")

    # Determine convergence status
    convergence_cfg = config.get("convergence", {})
    patience = convergence_cfg.get("patience", 3)
    converged = "not converged"
    if kept_experiments > 0:
        # Check if last N kept experiments showed no improvement
        kept_exps = [e for e in experiments if e.get("status") == "kept"]
        if len(kept_exps) >= patience:
            recent_vals = [
                e.get("metrics", {}).get(metric)
                for e in kept_exps[-patience:]
                if e.get("metrics", {}).get(metric) is not None
            ]
            # Only decide when every entry in the window reported the metric;
            # "converged" means the oldest value in the window is still the
            # best one, i.e. no improvement within the patience window.
            if len(recent_vals) == patience:
                if lower_is_better:
                    converged = "converged" if min(recent_vals) == recent_vals[0] else "not converged"
                else:
                    converged = "converged" if max(recent_vals) == recent_vals[0] else "not converged"

    # Timezone-aware UTC timestamp for the card header.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    lines = [
        f"# Model Card: {project_name}",
        "",
        f"Generated: {now}",
        "",
    ]

    # --- Model Details ---
    lines.extend([
        "## Model Details",
        "",
    ])
    if best:
        # Prefer the per-experiment config; fall back to the project config.
        best_config = best.get("config", {})
        model_type = best_config.get("model_type", model_cfg.get("type", "unknown"))
        framework = _infer_framework(model_type)
        training_time = best.get("metrics", {}).get("training_time", "N/A")
        lines.extend([
            f"- **Model type:** {model_type}",
            f"- **Task:** {config.get('task_description', eval_cfg.get('primary_metric', 'N/A'))}",
            f"- **Primary metric:** {metric} ({direction} is better)",
            f"- **Framework:** {framework}",
            # Append the "s" unit only when a numeric training time exists.
            f"- **Training time:** {training_time}s" if training_time != "N/A" else f"- **Training time:** {training_time}",
            f"- **Artifact:** models/best/model.joblib",
        ])
    else:
        # No kept experiment yet: emit placeholders from the project config.
        lines.extend([
            f"- **Model type:** {model_cfg.get('type', 'unknown')}",
            f"- **Task:** {config.get('task_description', 'N/A')}",
            f"- **Primary metric:** {metric} ({direction} is better)",
            "- **Framework:** N/A",
            "- **Training time:** N/A",
            "- **Artifact:** N/A (no trained model yet)",
        ])

    # --- Training Data ---
    lines.extend([
        "",
        "## Training Data",
        "",
        f"- **Source:** {data_cfg.get('source', 'N/A')}",
    ])
    split_ratios = data_cfg.get("split_ratios", {})
    if split_ratios:
        splits_str = ", ".join(f"{k}: {v}" for k, v in split_ratios.items())
        lines.append(f"- **Split ratios:** {splits_str}")
    # Samples from experiment metadata if available
    if best and best.get("metrics", {}).get("n_samples"):
        lines.append(f"- **Samples:** {best['metrics']['n_samples']}")

    # --- Performance ---
    lines.extend([
        "",
        "## Performance",
        "",
    ])
    if best:
        # Render every metric of the best experiment as a markdown table row.
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        for m, v in best.get("metrics", {}).items():
            if m in ("training_time", "n_samples"):
                continue  # Skip non-metric fields
            if isinstance(v, float):
                lines.append(f"| {m} | {v:.4f} |")
            else:
                lines.append(f"| {m} | {v} |")

        # Per-class performance
        per_class = best.get("metrics", {}).get("per_class")
        if per_class:
            lines.extend([
                "",
                "### Per-Class Performance",
                "",
            ])
            if isinstance(per_class, dict):
                # Column headers are taken from the first class entry;
                # assumes all classes share the same metric keys — TODO confirm.
                headers = list(next(iter(per_class.values())).keys()) if per_class else []
                if headers:
                    lines.append("| Class | " + " | ".join(headers) + " |")
                    lines.append("|-------|" + "|".join(["-------"] * len(headers)) + "|")
                    for cls, vals in per_class.items():
                        row = " | ".join(
                            f"{vals.get(h, 'N/A'):.4f}" if isinstance(vals.get(h), float) else str(vals.get(h, "N/A"))
                            for h in headers
                        )
                        lines.append(f"| {cls} | {row} |")
    else:
        lines.append("No experiments completed yet.")

    # --- Training History ---
    lines.extend([
        "",
        "## Training History",
        "",
        f"- **Total experiments:** {total_experiments}",
        f"- **Experiments kept:** {kept_experiments}",
        f"- **Best experiment:** {best.get('experiment_id', 'N/A') if best else 'N/A'}",
        f"- **Convergence:** {converged}",
    ])

    # --- Limitations ---
    lines.extend([
        "",
        "## Limitations",
        "",
        f"- This model was trained on {data_cfg.get('source', 'the provided dataset')} and may not generalize to other distributions",
        "- Performance metrics are from the validation set; test set performance may differ",
    ])
    # Check for overfitting gap
    if best:
        # Compares the "train_<metric>" entry against the validation metric;
        # the 0.05 threshold is a fixed heuristic for flagging overfitting.
        train_metric = best.get("metrics", {}).get(f"train_{metric}")
        val_metric = best.get("metrics", {}).get(metric)
        if train_metric is not None and val_metric is not None:
            gap = abs(train_metric - val_metric)
            if gap > 0.05:
                lines.append(f"- Train/val gap of {gap:.3f} suggests possible overfitting")

    # --- Intended Use ---
    task_desc = config.get("task_description", "N/A")
    lines.extend([
        "",
        "## Intended Use",
        "",
        f"- {task_desc}",
        "- Not intended for: <placeholder for user to fill>",
    ])

    # --- Ethical Considerations ---
    lines.extend([
        "",
        "## Ethical Considerations",
        "",
        "- <placeholder -- users should document bias, fairness, and impact>",
    ])

    # --- Artifact Contract ---
    lines.extend([
        "",
        "## Artifact Contract",
        "",
        f"- **Contract version:** {contract['version']}",
        f"- **Bundle format:** {contract['bundle_format']}",
        "- See `model_contract.md` for full consumer documentation",
    ])

    card = "\n".join(lines) + "\n"

    if output_path:
        # Create parent directories as needed before writing the card.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        Path(output_path).write_text(card, encoding="utf-8")

    return card
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _infer_framework(model_type: str) -> str:
|
|
312
|
+
"""Infer the ML framework from the model type string."""
|
|
313
|
+
model_lower = model_type.lower()
|
|
314
|
+
if "xgboost" in model_lower or "xgb" in model_lower:
|
|
315
|
+
return "xgboost"
|
|
316
|
+
if "lightgbm" in model_lower or "lgb" in model_lower:
|
|
317
|
+
return "lightgbm"
|
|
318
|
+
if "catboost" in model_lower:
|
|
319
|
+
return "catboost"
|
|
320
|
+
if any(t in model_lower for t in ("random_forest", "randomforest", "logistic", "svm", "svc", "mlp", "knn")):
|
|
321
|
+
return "sklearn"
|
|
322
|
+
return model_lower
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def main() -> None:
    """CLI entry point."""
    parser = argparse.ArgumentParser(description="Generate a standardized model card")
    # Declare all options from one spec table: (flag, default, help text).
    option_specs = (
        ("--config", "config.yaml", "Path to config.yaml"),
        ("--log", "experiments/log.jsonl", "Path to experiment log"),
        ("--contract", "model_contract.md", "Path to model contract"),
        ("--output", None, "Output path (default: print to stdout)"),
    )
    for flag, default, help_text in option_specs:
        parser.add_argument(flag, default=default, help=help_text)
    args = parser.parse_args()

    card = generate_card(args.config, args.log, args.contract, args.output)
    # With --output the card was written to disk; otherwise dump it to stdout.
    print(f"Model card written to {args.output}" if args.output else card)
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main()
|