claude-turing 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +66 -3
  3. package/commands/card.md +36 -0
  4. package/commands/explore.md +107 -0
  5. package/commands/suggest.md +68 -4
  6. package/commands/turing.md +4 -0
  7. package/package.json +1 -1
  8. package/src/claude-md.js +1 -0
  9. package/src/install.js +2 -2
  10. package/src/verify.js +2 -0
  11. package/templates/requirements.txt +4 -0
  12. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  13. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  14. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +58 -3
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/manage_hypotheses.py +2 -2
  26. package/templates/scripts/plot_trajectory.py +611 -0
  27. package/templates/scripts/scaffold.py +8 -0
  28. package/templates/scripts/show_metrics.py +23 -2
  29. package/templates/scripts/treequest_suggest.py +520 -0
  30. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  31. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  32. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  33. package/templates/tests/test_cost_frontier.py +222 -0
@@ -0,0 +1,457 @@
+ """Export experiment results to CSV, Markdown, LaTeX, or JSON.
+
+ Reads experiments/log.jsonl and renders the selected experiments in the
+ requested format — suitable for pasting into README files, academic papers,
+ or downstream data pipelines.
+
+ Typical usage:
+     python scripts/export_results.py                                  # CSV, all experiments
+     python scripts/export_results.py --format markdown --status kept  # Markdown, kept only
+     python scripts/export_results.py --format latex --last 10 --sort accuracy
+     python scripts/export_results.py --format csv --output results.csv
+     python scripts/export_results.py --columns "experiment_id,accuracy,f1_weighted" --format markdown
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import csv
+ import io
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Any
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+ # Columns that are promoted to the top level from nested dicts in the raw
+ # JSONL entry. Everything else is treated as a plain top-level key.
+ _CONFIG_KEYS = {"model_type", "hyperparams"}
+ _DEFAULT_COLUMNS = ["experiment_id", "status", "model_type", "description"]
+
+
+ # ---------------------------------------------------------------------------
+ # Data extraction helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _flatten(entry: dict) -> dict[str, Any]:
+     """Return a flat dict from one JSONL entry.
+
+     Promotes ``config.model_type`` and all ``metrics.*`` keys to the top
+     level so callers can reference them by bare name (e.g. "accuracy",
+     "model_type").
+     """
+     flat: dict[str, Any] = {}
+
+     # Top-level scalar fields
+     for key in ("experiment_id", "timestamp", "status", "description",
+                 "git_commit", "parent_experiment", "hypothesis_id", "family",
+                 "model_path"):
+         flat[key] = entry.get(key, "")
+
+     # Tags as a semicolon-separated string so the list fits in a single cell
+     tags = entry.get("tags", [])
+     flat["tags"] = ";".join(tags) if isinstance(tags, list) else str(tags or "")
+
+     # config sub-keys
+     cfg = entry.get("config") or {}
+     flat["model_type"] = cfg.get("model_type", "")
+
+     # metrics — each metric becomes its own column
+     for k, v in (entry.get("metrics") or {}).items():
+         flat[k] = v
+
+     return flat
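
Note: for orientation, this is how _flatten behaves on a hypothetical log entry (the entry and its values are illustrative, not taken from the package):

    entry = {
        "experiment_id": "exp_007",
        "status": "kept",
        "tags": ["baseline", "fast"],
        "config": {"model_type": "xgboost", "hyperparams": {"n_estimators": 200}},
        "metrics": {"accuracy": 0.913, "f1_weighted": 0.905},
    }
    row = _flatten(entry)
    # row["model_type"] == "xgboost"   (promoted from config)
    # row["accuracy"] == 0.913         (every metric becomes its own column)
    # row["tags"] == "baseline;fast"   (joined for single-cell display)
    # hyperparams stay nested and are not promoted; absent scalars default to ""
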
+
+
+ def _collect_all_columns(rows: list[dict[str, Any]]) -> list[str]:
+     """Return a deterministic column order across all rows.
+
+     Order: default columns first (preserving the list), then all metric
+     columns sorted alphabetically, then any remaining keys sorted.
+     """
+     seen: set[str] = set()
+     ordered: list[str] = []
+
+     for col in _DEFAULT_COLUMNS:
+         if col not in seen:
+             ordered.append(col)
+             seen.add(col)
+
+     # Collect metric keys (anything that is not in the fixed set)
+     fixed = {
+         "experiment_id", "timestamp", "status", "description",
+         "git_commit", "parent_experiment", "hypothesis_id", "family",
+         "model_path", "tags", "model_type",
+     }
+     metric_keys: set[str] = set()
+     extra_keys: set[str] = set()
+     for row in rows:
+         for k in row:
+             if k in seen:
+                 continue
+             if k not in fixed:
+                 metric_keys.add(k)
+             else:
+                 extra_keys.add(k)
+
+     for k in sorted(metric_keys):
+         if k not in seen:
+             ordered.append(k)
+             seen.add(k)
+     for k in sorted(extra_keys):
+         if k not in seen:
+             ordered.append(k)
+             seen.add(k)
+
+     return ordered
+
+
+ # ---------------------------------------------------------------------------
+ # Filtering and sorting
+ # ---------------------------------------------------------------------------
+
+
+ def filter_experiments(
+     experiments: list[dict],
+     status: str,
+     last: int | None,
+     family: str | None,
+ ) -> list[dict]:
+     """Apply status, family, and recency filters to the raw experiment list."""
+     result = experiments
+
+     if status != "all":
+         result = [e for e in result if e.get("status") == status]
+
+     if family:
+         result = [e for e in result if e.get("family") == family]
+
+     if last is not None and last > 0:
+         result = result[-last:]
+
+     return result
+
+
+ def sort_experiments(
+     rows: list[dict[str, Any]],
+     sort_key: str | None,
+     ascending: bool,
+ ) -> list[dict[str, Any]]:
+     """Sort rows by the given key. Rows missing the key sort to the end."""
+     if not sort_key:
+         return rows
+
+     # Keep keyless rows out of the sort entirely so they land at the end
+     # regardless of direction, and so numeric and string values are never
+     # compared against each other (which raises TypeError in Python 3).
+     present = [r for r in rows if r.get(sort_key) not in (None, "")]
+     missing = [r for r in rows if r.get(sort_key) in (None, "")]
+
+     def _key(row: dict[str, Any]) -> tuple[int, float, str]:
+         val = row[sort_key]
+         try:
+             return (0, float(val), "")
+         except (TypeError, ValueError):
+             return (1, 0.0, str(val))
+
+     return sorted(present, key=_key, reverse=not ascending) + missing
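
Note: with the missing-key handling above, rows lacking the sort column land at the end in either direction (values below are made up):

    rows = [
        {"experiment_id": "a", "accuracy": 0.91},
        {"experiment_id": "b"},                    # no accuracy recorded
        {"experiment_id": "c", "accuracy": 0.88},
    ]
    sort_experiments(rows, "accuracy", ascending=False)   # -> a, c, b
    sort_experiments(rows, "accuracy", ascending=True)    # -> c, a, b
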
+
+
+ # ---------------------------------------------------------------------------
+ # Formatters
+ # ---------------------------------------------------------------------------
+
+
+ def _cell(value: Any) -> str:
+     """Convert a cell value to a display string."""
+     if value is None or value == "":
+         return ""
+     if isinstance(value, float):
+         # Compact float formatting: up to 6 significant digits with
+         # trailing zeros trimmed (e.g. 0.913000 -> "0.913")
+         return f"{value:.6g}"
+     return str(value)
+
+
+ def format_csv(rows: list[dict[str, Any]], columns: list[str]) -> str:
+     """Render rows as RFC 4180 CSV."""
+     buf = io.StringIO()
+     writer = csv.DictWriter(
+         buf,
+         fieldnames=columns,
+         extrasaction="ignore",
+         lineterminator="\n",
+     )
+     writer.writeheader()
+     for row in rows:
+         writer.writerow({col: _cell(row.get(col)) for col in columns})
+     return buf.getvalue()
+
+
+ def format_markdown(rows: list[dict[str, Any]], columns: list[str]) -> str:
+     """Render rows as a GitHub-flavoured Markdown table."""
+     if not rows:
+         return "_No experiments match the requested filters._\n"
+
+     # Compute column widths
+     widths = {col: max(len(col), max((len(_cell(r.get(col))) for r in rows), default=0))
+               for col in columns}
+
+     def _pad(text: str, col: str) -> str:
+         return text.ljust(widths[col])
+
+     header = "| " + " | ".join(_pad(col, col) for col in columns) + " |"
+     sep = "| " + " | ".join("-" * widths[col] for col in columns) + " |"
+     lines = [header, sep]
+
+     for row in rows:
+         cells = " | ".join(_pad(_cell(row.get(col)), col) for col in columns)
+         lines.append(f"| {cells} |")
+
+     return "\n".join(lines) + "\n"
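
Note: a sketch of the table this produces for two hypothetical rows:

    print(format_markdown(
        [{"experiment_id": "exp_001", "accuracy": 0.91},
         {"experiment_id": "exp_002", "accuracy": 0.875}],
        ["experiment_id", "accuracy"],
    ))
    # | experiment_id | accuracy |
    # | ------------- | -------- |
    # | exp_001       | 0.91     |
    # | exp_002       | 0.875    |
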
+
+
+ def _latex_escape(text: str) -> str:
+     """Escape special LaTeX characters in a cell value."""
+     # The backslash goes through a placeholder: escaping it directly to
+     # \textbackslash{} and then escaping braces would mangle it into
+     # \textbackslash\{\}.
+     replacements = [
+         ("\\", "\x00"),
+         ("&", r"\&"),
+         ("%", r"\%"),
+         ("$", r"\$"),
+         ("#", r"\#"),
+         ("_", r"\_"),
+         ("{", r"\{"),
+         ("}", r"\}"),
+         ("~", r"\textasciitilde{}"),
+         ("^", r"\textasciicircum{}"),
+         ("\x00", r"\textbackslash{}"),
+     ]
+     for old, new in replacements:
+         text = text.replace(old, new)
+     return text
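
Note: with the placeholder ordering above, a few illustrative calls:

    _latex_escape("f1_weighted")            # -> f1\_weighted
    _latex_escape("90% of runs cost <$5")   # -> 90\% of runs cost <\$5
    _latex_escape("C:\\run_3")              # -> C:\textbackslash{}run\_3
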
+
+
+ def format_latex(rows: list[dict[str, Any]], columns: list[str]) -> str:
+     """Render rows as a LaTeX tabular environment.
+
+     Produces a self-contained snippet ready to drop into a paper's
+     ``table`` float. Numeric columns use right-alignment; text columns
+     use left-alignment.
+     """
+     if not rows:
+         return "% No experiments match the requested filters.\n"
+
+     # Determine alignment: right-align columns whose values look numeric
+     def _is_numeric_col(col: str) -> bool:
+         for row in rows:
+             val = row.get(col)
+             if val is None or val == "":
+                 continue
+             try:
+                 float(val)
+                 return True
+             except (TypeError, ValueError):
+                 return False
+         return False
+
+     alignments = ["r" if _is_numeric_col(c) else "l" for c in columns]
+     col_spec = " ".join(alignments)
+
+     # _latex_escape already handles underscores; pre-escaping them here
+     # would double-escape column names like "f1_weighted".
+     header_row = " & ".join(
+         r"\textbf{" + _latex_escape(col) + "}"
+         for col in columns
+     )
+
+     data_lines = []
+     for row in rows:
+         cells = " & ".join(_latex_escape(_cell(row.get(col))) for col in columns)
+         data_lines.append(f"  {cells} \\\\")
+
+     lines = [
+         r"\begin{tabular}{" + col_spec + "}",
+         r"  \hline",
+         f"  {header_row} \\\\",
+         r"  \hline",
+         *data_lines,
+         r"  \hline",
+         r"\end{tabular}",
+     ]
+     return "\n".join(lines) + "\n"
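
Note: on the same two hypothetical rows used above, format_latex emits:

    \begin{tabular}{l r}
      \hline
      \textbf{experiment\_id} & \textbf{accuracy} \\
      \hline
      exp\_001 & 0.91 \\
      exp\_002 & 0.875 \\
      \hline
    \end{tabular}
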
+
+
+ def format_json(rows: list[dict[str, Any]], columns: list[str]) -> str:
+     """Render rows as a JSON array containing only the selected columns."""
+     filtered = [{col: row.get(col) for col in columns} for row in rows]
+     return json.dumps(filtered, indent=2, default=str) + "\n"
+
+
+ # ---------------------------------------------------------------------------
+ # Main
+ # ---------------------------------------------------------------------------
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     """Return the configured argument parser."""
+     parser = argparse.ArgumentParser(
+         description=(
+             "Export experiment results from experiments/log.jsonl to CSV, "
+             "Markdown, LaTeX, or JSON."
+         ),
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""examples:
+   python scripts/export_results.py
+   python scripts/export_results.py --format markdown --status kept
+   python scripts/export_results.py --format latex --last 10 --sort accuracy
+   python scripts/export_results.py --format csv --output results.csv
+   python scripts/export_results.py --columns "experiment_id,accuracy,f1_weighted" --format markdown
+ """,
+     )
+
+     parser.add_argument(
+         "--log",
+         default=DEFAULT_LOG_PATH,
+         metavar="PATH",
+         help=f"Path to JSONL experiment log (default: {DEFAULT_LOG_PATH})",
+     )
+     parser.add_argument(
+         "--format",
+         choices=["csv", "markdown", "latex", "json"],
+         default="csv",
+         help="Output format (default: csv)",
+     )
+     parser.add_argument(
+         "--status",
+         choices=["all", "kept", "discarded"],
+         default="all",
+         help="Filter by experiment status (default: all)",
+     )
+     parser.add_argument(
+         "--last",
+         type=int,
+         default=None,
+         metavar="N",
+         help="Include only the last N experiments (applied after the status and family filters)",
+     )
+     parser.add_argument(
+         "--family",
+         default=None,
+         metavar="NAME",
+         help="Filter to a specific experiment family",
+     )
+     parser.add_argument(
+         "--columns",
+         default=None,
+         metavar="COL1,COL2,...",
+         help=(
+             "Comma-separated list of columns to include. "
+             "Defaults to experiment_id, status, model_type, all metrics, description."
+         ),
+     )
+     parser.add_argument(
+         "--sort",
+         default=None,
+         metavar="METRIC",
+         help="Sort by this column, descending (for higher-is-better metrics)",
+     )
+     parser.add_argument(
+         "--sort-asc",
+         default=None,
+         metavar="METRIC",
+         dest="sort_asc",
+         help="Sort by this column, ascending (for lower-is-better metrics)",
+     )
+     parser.add_argument(
+         "--output",
+         default=None,
+         metavar="FILE",
+         help="Write output to FILE instead of stdout",
+     )
+
+     return parser
+
+
+ def main() -> None:
+     """CLI entry point for export_results."""
+     parser = build_parser()
+     args = parser.parse_args()
+
+     # --sort and --sort-asc are mutually exclusive
+     if args.sort and args.sort_asc:
+         parser.error("--sort and --sort-asc are mutually exclusive")
+
+     sort_key = args.sort or args.sort_asc
+     ascending = bool(args.sort_asc)
+
+     # Load config for metric names (used to build the default column list)
+     config = load_config()
+     eval_cfg = config.get("evaluation", {})
+     configured_metrics: list[str] = eval_cfg.get("metrics", [])
+
+     # Load and filter experiments
+     raw_experiments = load_experiments(args.log)
+
+     if not raw_experiments:
+         msg = f"No experiments found in {args.log}."
+         if args.format == "markdown":
+             print(f"_{msg}_")
+         else:
+             print(msg, file=sys.stderr)
+         sys.exit(0)
+
+     filtered = filter_experiments(raw_experiments, args.status, args.last, args.family)
+
+     if not filtered:
+         msg = "No experiments match the requested filters."
+         if args.format == "markdown":
+             print(f"_{msg}_")
+         else:
+             print(msg, file=sys.stderr)
+         sys.exit(0)
+
+     # Flatten each entry
+     rows = [_flatten(e) for e in filtered]
+
+     # Determine column list
+     if args.columns:
+         columns = [c.strip() for c in args.columns.split(",") if c.strip()]
+     else:
+         # Build default: fixed header + configured metrics + description
+         all_cols = _collect_all_columns(rows)
+         available = {c for r in rows for c in r}
+         # Promote configured metrics to appear before description when they
+         # exist in the data.
+         fixed_head = ["experiment_id", "status", "model_type"]
+         fixed_tail = ["description"]
+         metric_cols = [m for m in configured_metrics if m in available]
+         # Any additional metric columns not listed in config
+         extra_metric_cols = [
+             c for c in all_cols
+             if c not in fixed_head
+             and c not in fixed_tail
+             and c not in metric_cols
+             and c not in {"timestamp", "git_commit", "parent_experiment",
+                           "hypothesis_id", "family", "model_path", "tags"}
+         ]
+         columns = fixed_head + metric_cols + extra_metric_cols + fixed_tail
+
+     # Sort
+     rows = sort_experiments(rows, sort_key, ascending)
+
+     # Render
+     formatters = {
+         "csv": format_csv,
+         "markdown": format_markdown,
+         "latex": format_latex,
+         "json": format_json,
+     }
+     output_text = formatters[args.format](rows, columns)
+
+     # Write
+     if args.output:
+         out_path = Path(args.output)
+         out_path.parent.mkdir(parents=True, exist_ok=True)
+         out_path.write_text(output_text, encoding="utf-8")
+         print(f"Wrote {len(rows)} experiment(s) to {out_path}", file=sys.stderr)
+     else:
+         print(output_text, end="")
+
+
+ if __name__ == "__main__":
+     main()
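
Note: an end-to-end sketch of the new pipeline, bypassing the CLI. The two entries are fabricated, and scripts.export_results is assumed to be importable from the repo root (mirroring the module's own scripts.* imports):

    from scripts.export_results import _flatten, format_csv, sort_experiments

    entries = [
        {"experiment_id": "exp_001", "status": "kept",
         "config": {"model_type": "ridge"}, "metrics": {"accuracy": 0.91}},
        {"experiment_id": "exp_002", "status": "discarded",
         "config": {"model_type": "xgboost"}, "metrics": {"accuracy": 0.875}},
    ]
    rows = sort_experiments([_flatten(e) for e in entries], "accuracy", ascending=False)
    print(format_csv(rows, ["experiment_id", "status", "model_type", "accuracy"]), end="")
    # experiment_id,status,model_type,accuracy
    # exp_001,kept,ridge,0.91
    # exp_002,discarded,xgboost,0.875
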
@@ -23,6 +23,7 @@ from pathlib import Path
  
  import yaml
  
+ from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
  from scripts.turing_io import load_config, load_experiments, load_hypotheses
  
  
@@ -220,6 +221,8 @@ def format_brief(
      lower_is_better: bool,
      failure_clusters: list[dict] | None = None,
      env_warnings: list[str] | None = None,
+     cost_data: list | None = None,
+     cost_frontier: list | None = None,
  ) -> str:
      """Format the research briefing as markdown."""
      direction = "lower" if lower_is_better else "higher"
@@ -283,7 +286,8 @@ def format_brief(
          lines.append(f"**{len(queued)} queued:**")
          for h in queued:
              priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
-             source_marker = " [human]" if h.get("source") == "human" else ""
+             source = h.get("source", "")
+             source_marker = f" [{source}]" if source in ("human", "treequest", "literature") else ""
              lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
      else:
          lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
@@ -312,6 +316,51 @@ def format_brief(
          lines.append("")
          lines.append("*Results may not be directly comparable. Consider re-running the best experiment in the current environment.*")
  
+     # Cost-performance analysis (only if train_seconds data exists)
+     if cost_data and cost_frontier is not None:
+         lines.extend(["", "## Cost-Performance Analysis", ""])
+
+         frontier_ids = {r.experiment_id for r in cost_frontier}
+
+         lines.append("**Pareto frontier (efficient set):**")
+         if cost_frontier:
+             for r in cost_frontier:
+                 lines.append(
+                     f"- {r.experiment_id} ({r.model_type}): "
+                     f"{metric}={r.metric_value:.4f}, time={_format_seconds(r.train_seconds)}"
+                 )
+         else:
+             lines.append("- No Pareto-optimal experiments found.")
+
+         # Compare best metric vs cheapest frontier alternative
+         if len(cost_frontier) >= 2:
+             if lower_is_better:
+                 best_cost_exp = min(cost_data, key=lambda r: r.metric_value)
+             else:
+                 best_cost_exp = max(cost_data, key=lambda r: r.metric_value)
+
+             cheapest = cost_frontier[0]  # sorted by train_seconds
+
+             if best_cost_exp.experiment_id != cheapest.experiment_id:
+                 metric_diff = abs(best_cost_exp.metric_value - cheapest.metric_value)
+                 if cheapest.metric_value != 0:
+                     pct = metric_diff / abs(cheapest.metric_value) * 100
+                 else:
+                     pct = 0.0
+                 if cheapest.train_seconds > 0:
+                     ratio = best_cost_exp.train_seconds / cheapest.train_seconds
+                 else:
+                     ratio = float("inf")
+
+                 lines.extend([
+                     "",
+                     f"Current best: **{best_cost_exp.experiment_id}** "
+                     f"({best_cost_exp.metric_value:.4f}, {_format_seconds(best_cost_exp.train_seconds)})",
+                     f"Cheapest acceptable: **{cheapest.experiment_id}** "
+                     f"({cheapest.metric_value:.4f}, {_format_seconds(cheapest.train_seconds)})",
+                     f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
+                 ])
+
      lines.extend([
          "",
          "## Recommendations",
@@ -339,9 +388,9 @@ def format_brief(
  
      # Check if hypotheses are exhausted
      if not queued:
-         lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
+         lines.append("- No hypotheses queued — inject ideas with `/turing:try` or explore with `/turing:explore`")
  
-     lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses. Use `/turing:train` to execute.*"])
+     lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses, `/turing:explore` for tree search, `/turing:train` to execute.*"])
  
      return "\n".join(lines)
  
@@ -367,9 +416,15 @@ def generate_brief(
      failures = cluster_failures(experiments)
      env_warnings = detect_environment_drift(experiments)
  
+     # Load cost-performance data if available
+     cost_records = load_cost_data(log_path, metric)
+     pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
+
      return format_brief(
          campaign, best, trajectory, model_types, hypotheses,
          metric, lower_is_better, failures, env_warnings,
+         cost_data=cost_records if cost_records else None,
+         cost_frontier=pareto if cost_records else None,
      )
  
  