claude-turing 3.4.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +12 -2
  3. package/commands/annotate.md +23 -0
  4. package/commands/archive.md +23 -0
  5. package/commands/changelog.md +22 -0
  6. package/commands/cite.md +23 -0
  7. package/commands/flashback.md +22 -0
  8. package/commands/present.md +23 -0
  9. package/commands/replay.md +23 -0
  10. package/commands/search.md +22 -0
  11. package/commands/template.md +22 -0
  12. package/commands/trend.md +21 -0
  13. package/commands/turing.md +20 -0
  14. package/package.json +1 -1
  15. package/src/install.js +2 -0
  16. package/src/verify.js +10 -0
  17. package/templates/scripts/__pycache__/citation_manager.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/experiment_annotations.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_archive.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_replay.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/experiment_search.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/experiment_templates.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/generate_changelog.cpython-314.pyc +0 -0
  24. package/templates/scripts/__pycache__/generate_figures.cpython-314.pyc +0 -0
  25. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  26. package/templates/scripts/__pycache__/session_flashback.cpython-314.pyc +0 -0
  27. package/templates/scripts/__pycache__/trend_analysis.cpython-314.pyc +0 -0
  28. package/templates/scripts/citation_manager.py +436 -0
  29. package/templates/scripts/experiment_annotations.py +392 -0
  30. package/templates/scripts/experiment_archive.py +534 -0
  31. package/templates/scripts/experiment_replay.py +592 -0
  32. package/templates/scripts/experiment_search.py +451 -0
  33. package/templates/scripts/experiment_templates.py +501 -0
  34. package/templates/scripts/generate_changelog.py +464 -0
  35. package/templates/scripts/generate_figures.py +597 -0
  36. package/templates/scripts/scaffold.py +17 -0
  37. package/templates/scripts/session_flashback.py +461 -0
  38. package/templates/scripts/trend_analysis.py +503 -0
@@ -0,0 +1,464 @@
+ #!/usr/bin/env python3
+ """Model changelog generator for the autoresearch pipeline.
+
+ Reads experiment history, identifies "keep" decisions that improved
+ the primary metric, groups them into versions by step-change
+ improvements, and formats a narrative changelog. Designed for both
+ technical and stakeholder audiences.
+
+ Usage:
+     python scripts/generate_changelog.py
+     python scripts/generate_changelog.py --audience stakeholder
+     python scripts/generate_changelog.py --audience technical --since exp-030
+     python scripts/generate_changelog.py --since 2026-03-15
+     python scripts/generate_changelog.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_OUTPUT_PATH = "paper/CHANGELOG.md"
+
+ # Minimum relative improvement to trigger a new version boundary
+ VERSION_THRESHOLD = 0.02  # 2% relative improvement
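+ # e.g. a kept run that lifts accuracy from 0.800 to 0.820 is a 2.5%
+ # relative gain, crossing the threshold and opening a new version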
+
+
+ # --- Version Detection ---
+
+
+ def compute_trajectory(
+     experiments: list[dict],
+     metric_name: str,
+     lower_is_better: bool = False,
+ ) -> list[dict]:
+     """Build improvement trajectory from kept experiments."""
+     trajectory = []
+     best_val = None
+
+     for exp in experiments:
+         if exp.get("status") != "kept":
+             continue
+
+         val = exp.get("metrics", {}).get(metric_name)
+         if val is None or not isinstance(val, (int, float)):
+             continue
+
+         prev_best = best_val
+         is_improvement = False
+
+         if best_val is None:
+             best_val = val
+             is_improvement = True
+         elif lower_is_better and val < best_val:
+             best_val = val
+             is_improvement = True
+         elif not lower_is_better and val > best_val:
+             best_val = val
+             is_improvement = True
+
+         if is_improvement:
+             delta = 0.0
+             relative_delta = 0.0
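+             # The first kept experiment is the baseline: prev_best is None,
+             # so its delta and relative_delta stay 0.0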
+             if prev_best is not None:
+                 delta = val - prev_best
+                 if prev_best != 0:
+                     relative_delta = abs(delta) / abs(prev_best)
+
+             trajectory.append({
+                 "experiment_id": exp.get("experiment_id", "?"),
+                 "description": exp.get("description", ""),
+                 "timestamp": exp.get("timestamp", ""),
+                 "value": val,
+                 "delta": delta,
+                 "relative_delta": relative_delta,
+                 "config": exp.get("config", {}),
+                 "family": exp.get("family") or exp.get("config", {}).get("model_type", ""),
+             })
+
+     return trajectory
+
+
+ def detect_version_boundaries(
+     trajectory: list[dict],
+     threshold: float = VERSION_THRESHOLD,
+ ) -> list[int]:
+     """Detect version boundaries based on significant metric jumps.
+
+     A new version starts when the relative improvement exceeds the
+     threshold, suggesting a qualitative step-change rather than
+     incremental tuning.
+
+     Returns indices into the trajectory where new versions begin.
+     """
+     boundaries = [0]  # First entry always starts v1
+
+     for i, entry in enumerate(trajectory):
+         if i == 0:
+             continue
+         if entry["relative_delta"] >= threshold:
+             boundaries.append(i)
+
+     return boundaries
+
+
+ def group_into_versions(
+     trajectory: list[dict],
+     boundaries: list[int],
+ ) -> list[dict]:
+     """Group trajectory entries into version blocks.
+
+     Each version contains the improvements between consecutive
+     boundaries, with a summary of the collective improvement.
+     """
+     versions = []
+
+     for vi, start in enumerate(boundaries):
+         end = boundaries[vi + 1] if vi + 1 < len(boundaries) else len(trajectory)
+         entries = trajectory[start:end]
+         if not entries:
+             continue
+
+         first_val = entries[0]["value"]
+         last_val = entries[-1]["value"]
+         version_delta = last_val - first_val
+
+         # Timestamp range
+         timestamps = [e["timestamp"] for e in entries if e.get("timestamp")]
+         date_range = ""
+         if timestamps:
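+             # ISO-8601 timestamps: the first 10 characters are the YYYY-MM-DD date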
+             first_ts = min(timestamps)[:10]
+             last_ts = max(timestamps)[:10]
+             date_range = first_ts if first_ts == last_ts else f"{first_ts} to {last_ts}"
+
+         versions.append({
+             "version": f"v{vi + 1}.0",
+             "improvements": entries,
+             "start_value": first_val,
+             "end_value": last_val,
+             "version_delta": version_delta,
+             "n_improvements": len(entries),
+             "date_range": date_range,
+         })
+
+     return versions
+
+
+ # --- Formatting ---
+
+
+ def format_technical_changelog(
+     versions: list[dict],
+     metric_name: str,
+     lower_is_better: bool,
+     task_description: str,
+ ) -> str:
+     """Format changelog for technical audience with experiment IDs and deltas."""
+     direction = "lower is better" if lower_is_better else "higher is better"
+     lines = [
+         "# Model Changelog",
+         "",
+         f"**Task:** {task_description}",
+         f"**Primary metric:** {metric_name} ({direction})",
+         f"**Generated:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
+         "",
+         "---",
+         "",
+     ]
+
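+     # Walk versions newest-first, the conventional changelog order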
+     for version in reversed(versions):
+         ver = version["version"]
+         end_val = version["end_value"]
+         delta = version["version_delta"]
+         delta_sign = "+" if delta > 0 else ""
+         date_range = version.get("date_range", "")
+         n = version["n_improvements"]
+
+         lines.append(f"## {ver} ({date_range})")
+         lines.append("")
+         lines.append(f"**{metric_name}:** {end_val:.4f} ({delta_sign}{delta:.4f} from version start)")
+         lines.append(f"**Improvements:** {n}")
+         lines.append("")
+
+         for entry in version["improvements"]:
+             exp_id = entry["experiment_id"]
+             desc = entry["description"] or "No description"
+             val = entry["value"]
+             entry_delta = entry["delta"]
+             entry_sign = "+" if entry_delta > 0 else ""
+             family = entry.get("family", "")
+             family_tag = f" [{family}]" if family else ""
+
+             lines.append(f"- `{exp_id}`{family_tag}: {desc}")
+             lines.append(f"  {metric_name}: {val:.4f} ({entry_sign}{entry_delta:.4f})")
+
+             # Show key config changes
+             cfg = entry.get("config", {})
+             if cfg:
+                 cfg_items = []
+                 for k, v in cfg.items():
+                     if isinstance(v, (int, float, str, bool)):
+                         cfg_items.append(f"{k}={v}")
+                 if cfg_items:
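+                     # Cap at five config entries so the line stays readable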
+                     lines.append(f"  Config: {', '.join(cfg_items[:5])}")
+
+         lines.append("")
+         lines.append("---")
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ def format_stakeholder_changelog(
+     versions: list[dict],
+     metric_name: str,
+     lower_is_better: bool,
+     task_description: str,
+ ) -> str:
+     """Format changelog for non-technical stakeholders.
+
+     No experiment IDs, no configs. Plain English with clear
+     performance narratives.
+     """
+     direction_word = "reduced" if lower_is_better else "improved"
+     lines = [
+         "# Model Progress Report",
+         "",
+         f"**Task:** {task_description}",
+         f"**Generated:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
+         "",
+     ]
+
+     # Overall summary
+     if versions:
+         first_val = versions[0]["start_value"]
+         last_val = versions[-1]["end_value"]
+         total_delta = last_val - first_val
+         total_pct = abs(total_delta / first_val * 100) if first_val != 0 else 0
+
+         lines.append("## Overall Progress")
+         lines.append("")
+         lines.append(f"Performance {direction_word} from **{first_val:.4f}** to "
+                      f"**{last_val:.4f}** ({total_pct:.1f}% change) across "
+                      f"**{len(versions)} version(s)**.")
+         lines.append("")
+         lines.append("---")
+         lines.append("")
+
+     for version in reversed(versions):
+         ver = version["version"]
+         date_range = version.get("date_range", "")
+         n = version["n_improvements"]
+
+         lines.append(f"## {ver}" + (f" ({date_range})" if date_range else ""))
+         lines.append("")
+
+         # Narrative description
+         improvements = version["improvements"]
+         end_val = version["end_value"]
+
+         # Summarize what changed in plain language
+         families_seen = set()
+         descriptions = []
+         for entry in improvements:
+             desc = entry.get("description", "")
+             family = entry.get("family", "")
+             if family:
+                 families_seen.add(family)
+             if desc:
+                 # Clean up technical jargon for stakeholders
+                 clean = desc.replace("_", " ").strip()
+                 if clean and clean not in descriptions:
+                     descriptions.append(clean)
+
+         if families_seen:
+             lines.append(f"Explored approaches: {', '.join(sorted(families_seen))}.")
+             lines.append("")
+
+         if descriptions:
+             lines.append("Key changes:")
+             lines.append("")
+             for desc in descriptions[:5]:
+                 lines.append(f"- {desc}")
+             lines.append("")
+
+         # Performance summary in plain language
+         delta = version["version_delta"]
+         if abs(delta) > 0:
+             pct = abs(delta / version["start_value"] * 100) if version["start_value"] != 0 else 0
+             lines.append(f"Result: {metric_name.replace('_', ' ')} {direction_word} by "
+                          f"**{pct:.1f}%** to **{end_val:.4f}** "
+                          f"({n} improvement{'s' if n > 1 else ''}).")
+         else:
+             lines.append(f"Established baseline at **{end_val:.4f}**.")
+
+         lines.append("")
+         lines.append("---")
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ # --- Report ---
+
+
+ def format_changelog_report(result: dict) -> str:
+     """Format the changelog result as readable text."""
+     if "changelog" in result:
+         return result["changelog"]
+     return json.dumps(result, indent=2, default=str)
+
+
+ def save_changelog_report(
+     result: dict,
+     output_path: str = DEFAULT_OUTPUT_PATH,
+ ) -> Path:
+     """Save the changelog to a markdown file."""
+     p = Path(output_path)
+     p.parent.mkdir(parents=True, exist_ok=True)
+     changelog = result.get("changelog", "")
+     with open(p, "w") as f:
+         f.write(changelog)
+     return p
+
+
+ # --- Orchestration ---
+
+
+ def filter_experiments_since(
+     experiments: list[dict],
+     since: str,
+ ) -> list[dict]:
+     """Filter experiments to those after a given experiment ID or date.
+
+     If `since` looks like an experiment ID (starts with "exp-"), filter
+     to experiments after that one. Otherwise treat as a date string.
+     """
+     if since.startswith("exp-"):
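+         # Single pass: start collecting only after the matching ID is seen,
+         # so the named experiment itself is excluded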
+         found = False
+         filtered = []
+         for exp in experiments:
+             if found:
+                 filtered.append(exp)
+             if exp.get("experiment_id") == since:
+                 found = True
+         return filtered
+     else:
+         return [e for e in experiments if e.get("timestamp", "") >= since]
+
+
+ def run_generate_changelog(
+     audience: str = "technical",
+     since: str | None = None,
+     threshold: float = VERSION_THRESHOLD,
+     log_path: str = DEFAULT_LOG_PATH,
+     config_path: str = "config.yaml",
+     output_path: str = DEFAULT_OUTPUT_PATH,
+     save: bool = True,
+ ) -> dict:
+     """Generate model changelog from experiment history."""
+     timestamp = datetime.now(timezone.utc).isoformat()
+     config = load_config(config_path)
+     experiments = load_experiments(log_path)
+
+     if not experiments:
+         return {"timestamp": timestamp, "error": "No experiments found in log"}
+
+     if since:
+         experiments = filter_experiments_since(experiments, since)
+         if not experiments:
+             return {"timestamp": timestamp, "error": f"No experiments found after '{since}'"}
+
+     metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
+     lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)
+     task_desc = config.get("task", {}).get("description", "ML experiment campaign")
+
+     trajectory = compute_trajectory(experiments, metric_name, lower_is_better)
+
+     if not trajectory:
+         return {"timestamp": timestamp, "error": "No kept experiments with metrics found"}
+
+     boundaries = detect_version_boundaries(trajectory, threshold)
+     versions = group_into_versions(trajectory, boundaries)
+
+     if audience == "stakeholder":
+         changelog = format_stakeholder_changelog(
+             versions, metric_name, lower_is_better, task_desc,
+         )
+     else:
+         changelog = format_technical_changelog(
+             versions, metric_name, lower_is_better, task_desc,
+         )
+
+     result = {
+         "timestamp": timestamp,
+         "audience": audience,
+         "metric": metric_name,
+         "n_versions": len(versions),
+         "n_improvements": len(trajectory),
+         "versions": versions,
+         "changelog": changelog,
+     }
+
+     if save:
+         saved_path = save_changelog_report(result, output_path)
+         result["saved_to"] = str(saved_path)
+
+     return result
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Generate model changelog from experiment history",
+     )
+     parser.add_argument("--audience", choices=["technical", "stakeholder"],
+                         default="technical",
+                         help="Target audience (technical = full detail, stakeholder = plain English)")
+     parser.add_argument("--since", default=None,
+                         help="Start from experiment ID (exp-NNN) or date (YYYY-MM-DD)")
+     parser.add_argument("--threshold", type=float, default=VERSION_THRESHOLD,
+                         help=f"Relative improvement threshold for version boundaries (default: {VERSION_THRESHOLD})")
+     parser.add_argument("--output", default=DEFAULT_OUTPUT_PATH,
+                         help="Output file path")
+     parser.add_argument("--no-save", action="store_true",
+                         help="Print to stdout instead of saving file")
+     parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+     parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+     parser.add_argument("--json", action="store_true", help="Output raw JSON")
+     args = parser.parse_args()
+
+     report = run_generate_changelog(
+         audience=args.audience,
+         since=args.since,
+         threshold=args.threshold,
+         log_path=args.log,
+         config_path=args.config,
+         output_path=args.output,
+         save=not args.no_save,
+     )
+
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         if "error" in report:
+             print(f"ERROR: {report['error']}", file=sys.stderr)
+             sys.exit(1)
+         print(format_changelog_report(report))
+         saved = report.get("saved_to")
+         if saved:
+             print(f"\nSaved to {saved}", file=sys.stderr)
+
+
+ if __name__ == "__main__":
+     main()
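
For context on the input format: load_experiments reads experiments/log.jsonl, one JSON object per line, and compute_trajectory keeps only entries with status "kept" and a numeric value under the configured primary metric. A minimal record that would pass those filters might look like the sketch below; the field names come from the code above, but the values are illustrative, not taken from the package.

    {"experiment_id": "exp-042", "status": "kept", "timestamp": "2026-03-18T14:02:11+00:00", "description": "tuned learning rate schedule", "family": "gradient_boosting", "config": {"learning_rate": 0.05, "n_estimators": 400}, "metrics": {"accuracy": 0.871}}

The metric name ("accuracy" here) is read from config.yaml under evaluation.primary_metric, with evaluation.lower_is_better and task.description supplying the remaining report metadata.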