claude-turing 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
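
To reproduce a diff like this locally (assuming npm 7 or later, where the npm diff command is available):

    npm diff --diff=claude-turing@1.0.0 --diff=claude-turing@1.1.0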
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +48 -7
- package/commands/brief.md +13 -1
- package/commands/card.md +36 -0
- package/commands/init.md +13 -0
- package/commands/train.md +16 -7
- package/commands/turing.md +4 -2
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/model_contract.md +49 -0
- package/templates/model_registry.yaml +69 -0
- package/templates/program.md +2 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +54 -0
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +9 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0

package/templates/scripts/export_results.py
@@ -0,0 +1,457 @@
+"""Export experiment results to CSV, Markdown, LaTeX, or JSON.
+
+Reads experiments/log.jsonl and renders the selected experiments in the
+requested format — suitable for pasting into README files, academic papers,
+or downstream data pipelines.
+
+Typical usage:
+    python scripts/export_results.py                                   # CSV, all experiments
+    python scripts/export_results.py --format markdown --status kept   # Markdown, kept only
+    python scripts/export_results.py --format latex --last 10 --sort accuracy
+    python scripts/export_results.py --format csv --output results.csv
+    python scripts/export_results.py --columns "experiment_id,accuracy,f1_weighted" --format markdown
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import io
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+# Columns that are promoted to the top level from nested dicts in the raw
+# JSONL entry. Everything else is treated as a plain top-level key.
+_CONFIG_KEYS = {"model_type", "hyperparams"}
+_DEFAULT_COLUMNS = ["experiment_id", "status", "model_type", "description"]
+
+
+# ---------------------------------------------------------------------------
+# Data extraction helpers
+# ---------------------------------------------------------------------------
+
+
+def _flatten(entry: dict) -> dict[str, Any]:
+    """Return a flat dict from one JSONL entry.
+
+    Promotes ``config.model_type`` and all ``metrics.*`` keys to the top
+    level so callers can reference them by bare name (e.g. "accuracy",
+    "model_type").
+    """
+    flat: dict[str, Any] = {}
+
+    # Top-level scalar fields
+    for key in ("experiment_id", "timestamp", "status", "description",
+                "git_commit", "parent_experiment", "hypothesis_id", "family",
+                "model_path"):
+        flat[key] = entry.get(key, "")
+
+    # Tags as a semicolon-separated string so it fits in a single cell
+    tags = entry.get("tags", [])
+    flat["tags"] = ";".join(tags) if isinstance(tags, list) else str(tags or "")
+
+    # config sub-keys
+    cfg = entry.get("config") or {}
+    flat["model_type"] = cfg.get("model_type", "")
+
+    # metrics — each metric becomes its own column
+    for k, v in (entry.get("metrics") or {}).items():
+        flat[k] = v
+
+    return flat
+
+
+def _collect_all_columns(rows: list[dict[str, Any]]) -> list[str]:
+    """Return a deterministic column order across all rows.
+
+    Order: default columns first (preserving the list), then all metric
+    columns sorted alphabetically, then any remaining keys sorted.
+    """
+    seen: set[str] = set()
+    ordered: list[str] = []
+
+    for col in _DEFAULT_COLUMNS:
+        if col not in seen:
+            ordered.append(col)
+            seen.add(col)
+
+    # Collect metric keys (anything that is not in the fixed set)
+    fixed = {
+        "experiment_id", "timestamp", "status", "description",
+        "git_commit", "parent_experiment", "hypothesis_id", "family",
+        "model_path", "tags", "model_type",
+    }
+    metric_keys: set[str] = set()
+    extra_keys: set[str] = set()
+    for row in rows:
+        for k in row:
+            if k in seen:
+                continue
+            if k not in fixed:
+                metric_keys.add(k)
+            else:
+                extra_keys.add(k)
+
+    for k in sorted(metric_keys):
+        if k not in seen:
+            ordered.append(k)
+            seen.add(k)
+    for k in sorted(extra_keys):
+        if k not in seen:
+            ordered.append(k)
+            seen.add(k)
+
+    return ordered
+
+
+# ---------------------------------------------------------------------------
+# Filtering and sorting
+# ---------------------------------------------------------------------------
+
+
+def filter_experiments(
+    experiments: list[dict],
+    status: str,
+    last: int | None,
+    family: str | None,
+) -> list[dict]:
+    """Apply status, family, and recency filters to the raw experiment list."""
+    result = experiments
+
+    if status != "all":
+        result = [e for e in result if e.get("status") == status]
+
+    if family:
+        result = [e for e in result if e.get("family") == family]
+
+    if last is not None and last > 0:
+        result = result[-last:]
+
+    return result
+
+
+def sort_experiments(
+    rows: list[dict[str, Any]],
+    sort_key: str | None,
+    ascending: bool,
+) -> list[dict[str, Any]]:
+    """Sort rows by the given key. Rows missing the key sort to the end."""
+    if not sort_key:
+        return rows
+
+    def _key(row: dict[str, Any]) -> tuple[int, Any]:
+        val = row.get(sort_key)
+        if val is None or val == "":
+            return (1, 0)
+        try:
+            return (0, float(val))
+        except (TypeError, ValueError):
+            return (0, str(val))
+
+    return sorted(rows, key=_key, reverse=not ascending)
+
+
+# ---------------------------------------------------------------------------
+# Formatters
+# ---------------------------------------------------------------------------
+
+
+def _cell(value: Any) -> str:
+    """Convert a cell value to a display string."""
+    if value is None or value == "":
+        return ""
+    if isinstance(value, float):
+        # Trim trailing zeros but keep up to 6 significant digits
+        return f"{value:.6g}"
+    return str(value)
+
+
+def format_csv(rows: list[dict[str, Any]], columns: list[str]) -> str:
+    """Render rows as RFC 4180 CSV."""
+    buf = io.StringIO()
+    writer = csv.DictWriter(
+        buf,
+        fieldnames=columns,
+        extrasaction="ignore",
+        lineterminator="\n",
+    )
+    writer.writeheader()
+    for row in rows:
+        writer.writerow({col: _cell(row.get(col)) for col in columns})
+    return buf.getvalue()
+
+
+def format_markdown(rows: list[dict[str, Any]], columns: list[str]) -> str:
+    """Render rows as a GitHub-flavoured Markdown table."""
+    if not rows:
+        return "_No experiments match the requested filters._\n"
+
+    # Compute column widths
+    widths = {col: max(len(col), max((len(_cell(r.get(col))) for r in rows), default=0))
+              for col in columns}
+
+    def _pad(text: str, col: str) -> str:
+        return text.ljust(widths[col])
+
+    header = "| " + " | ".join(_pad(col, col) for col in columns) + " |"
+    sep = "| " + " | ".join("-" * widths[col] for col in columns) + " |"
+    lines = [header, sep]
+
+    for row in rows:
+        cells = " | ".join(_pad(_cell(row.get(col)), col) for col in columns)
+        lines.append(f"| {cells} |")
+
+    return "\n".join(lines) + "\n"
+
+
+def _latex_escape(text: str) -> str:
+    """Escape special LaTeX characters in a cell value."""
+    replacements = [
+        ("\\", r"\textbackslash{}"),
+        ("&", r"\&"),
+        ("%", r"\%"),
+        ("$", r"\$"),
+        ("#", r"\#"),
+        ("_", r"\_"),
+        ("{", r"\{"),
+        ("}", r"\}"),
+        ("~", r"\textasciitilde{}"),
+        ("^", r"\textasciicircum{}"),
+    ]
+    for old, new in replacements:
+        text = text.replace(old, new)
+    return text
+
+
+def format_latex(rows: list[dict[str, Any]], columns: list[str]) -> str:
+    """Render rows as a LaTeX tabular environment.
+
+    Produces a self-contained snippet ready to drop into a paper's
+    ``table`` float. Numeric columns use right-alignment; text columns
+    use left-alignment.
+    """
+    if not rows:
+        return "% No experiments match the requested filters.\n"
+
+    # Determine alignment: right-align columns whose values look numeric
+    def _is_numeric_col(col: str) -> bool:
+        for row in rows:
+            val = row.get(col)
+            if val is None or val == "":
+                continue
+            try:
+                float(val)
+                return True
+            except (TypeError, ValueError):
+                return False
+        return False
+
+    alignments = ["r" if _is_numeric_col(c) else "l" for c in columns]
+    col_spec = " ".join(alignments)
+
+    # Escape the header exactly once: feeding col.replace("_", r"\_") into
+    # _latex_escape would double-escape the underscore into
+    # "\textbackslash{}\_".
+    header_row = " & ".join(
+        r"\textbf{" + _latex_escape(col) + "}"
+        for col in columns
+    )
+
+    data_lines = []
+    for row in rows:
+        cells = " & ".join(_latex_escape(_cell(row.get(col))) for col in columns)
+        data_lines.append(f" {cells} \\\\")
+
+    lines = [
+        r"\begin{tabular}{" + col_spec + "}",
+        r" \hline",
+        f" {header_row} \\\\",
+        r" \hline",
+        *data_lines,
+        r" \hline",
+        r"\end{tabular}",
+    ]
+    return "\n".join(lines) + "\n"
+
+
+def format_json(rows: list[dict[str, Any]], columns: list[str]) -> str:
+    """Render rows as a JSON array containing only the selected columns."""
+    filtered = [{col: row.get(col) for col in columns} for row in rows]
+    return json.dumps(filtered, indent=2, default=str) + "\n"
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Return the configured argument parser."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Export experiment results from experiments/log.jsonl to CSV, "
+            "Markdown, LaTeX, or JSON."
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""examples:
+  python scripts/export_results.py
+  python scripts/export_results.py --format markdown --status kept
+  python scripts/export_results.py --format latex --last 10 --sort accuracy
+  python scripts/export_results.py --format csv --output results.csv
+  python scripts/export_results.py --columns "experiment_id,accuracy,f1_weighted" --format markdown
+""",
+    )
+
+    parser.add_argument(
+        "--log",
+        default=DEFAULT_LOG_PATH,
+        metavar="PATH",
+        help=f"Path to JSONL experiment log (default: {DEFAULT_LOG_PATH})",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["csv", "markdown", "latex", "json"],
+        default="csv",
+        help="Output format (default: csv)",
+    )
+    parser.add_argument(
+        "--status",
+        choices=["all", "kept", "discarded"],
+        default="all",
+        help="Filter by experiment status (default: all)",
+    )
+    parser.add_argument(
+        "--last",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Include only the last N experiments (applied after status filter)",
+    )
+    parser.add_argument(
+        "--family",
+        default=None,
+        metavar="NAME",
+        help="Filter to a specific experiment family",
+    )
+    parser.add_argument(
+        "--columns",
+        default=None,
+        metavar="COL1,COL2,...",
+        help=(
+            "Comma-separated list of columns to include. "
+            "Defaults to experiment_id, status, model_type, all metrics, description."
+        ),
+    )
+    parser.add_argument(
+        "--sort",
+        default=None,
+        metavar="METRIC",
+        help="Sort by this column, descending (higher-is-better default)",
+    )
+    parser.add_argument(
+        "--sort-asc",
+        default=None,
+        metavar="METRIC",
+        dest="sort_asc",
+        help="Sort by this column, ascending (lower-is-better)",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        metavar="FILE",
+        help="Write output to FILE instead of stdout",
+    )
+
+    return parser
+
+
+def main() -> None:
+    """CLI entry point for export_results."""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    # --sort and --sort-asc are mutually exclusive
+    if args.sort and args.sort_asc:
+        parser.error("--sort and --sort-asc are mutually exclusive")
+
+    sort_key = args.sort or args.sort_asc
+    ascending = bool(args.sort_asc)
+
+    # Load config for metric names (used to build default column list)
+    config = load_config()
+    eval_cfg = config.get("evaluation", {})
+    configured_metrics: list[str] = eval_cfg.get("metrics", [])
+
+    # Load and filter experiments
+    raw_experiments = load_experiments(args.log)
+
+    if not raw_experiments:
+        msg = f"No experiments found in {args.log}."
+        if args.format in ("markdown",):
+            print(f"_{msg}_")
+        else:
+            print(msg, file=sys.stderr)
+        sys.exit(0)
+
+    filtered = filter_experiments(raw_experiments, args.status, args.last, args.family)
+
+    if not filtered:
+        msg = "No experiments match the requested filters."
+        if args.format == "markdown":
+            print(f"_{msg}_")
+        else:
+            print(msg, file=sys.stderr)
+        sys.exit(0)
+
+    # Flatten each entry
+    rows = [_flatten(e) for e in filtered]
+
+    # Determine column list
+    if args.columns:
+        columns = [c.strip() for c in args.columns.split(",") if c.strip()]
+    else:
+        # Build default: fixed header + configured metrics + description
+        all_cols = _collect_all_columns(rows)
+        # Promote configured metrics to appear before description when they
+        # exist in the data.
+        fixed_head = ["experiment_id", "status", "model_type"]
+        fixed_tail = ["description"]
+        metric_cols = [m for m in configured_metrics if m in {c for r in rows for c in r}]
+        # Any additional metric columns not listed in config
+        extra_metric_cols = [
+            c for c in all_cols
+            if c not in fixed_head
+            and c not in fixed_tail
+            and c not in metric_cols
+            and c not in {"timestamp", "git_commit", "parent_experiment",
+                          "hypothesis_id", "family", "model_path", "tags"}
+        ]
+        columns = fixed_head + metric_cols + extra_metric_cols + fixed_tail
+
+    # Sort
+    rows = sort_experiments(rows, sort_key, ascending)
+
+    # Render
+    formatters = {
+        "csv": format_csv,
+        "markdown": format_markdown,
+        "latex": format_latex,
+        "json": format_json,
+    }
+    output_text = formatters[args.format](rows, columns)
+
+    # Write
+    if args.output:
+        out_path = Path(args.output)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        out_path.write_text(output_text, encoding="utf-8")
+        print(f"Wrote {len(rows)} experiment(s) to {out_path}", file=sys.stderr)
+    else:
+        print(output_text, end="")
+
+
+if __name__ == "__main__":
+    main()
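
A quick way to sanity-check the new formatters from a Python shell, run from the template root so the scripts package is importable. The exp-001/exp-002 rows below are made up for illustration, not data from this package:

    from scripts.export_results import format_markdown, format_csv

    rows = [  # hypothetical flattened entries, shaped like _flatten() output
        {"experiment_id": "exp-001", "status": "kept", "model_type": "xgboost", "accuracy": 0.91},
        {"experiment_id": "exp-002", "status": "discarded", "model_type": "logreg", "accuracy": 0.87},
    ]
    cols = ["experiment_id", "status", "model_type", "accuracy"]
    print(format_markdown(rows, cols))  # padded GitHub-style table
    print(format_csv(rows, cols))       # RFC 4180 CSV, header row first

Both formatters take the same (rows, columns) pair, so switching output formats is a one-word change.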

package/templates/scripts/generate_brief.py
@@ -23,6 +23,7 @@ from pathlib import Path
 
 import yaml
 
+from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
 from scripts.turing_io import load_config, load_experiments, load_hypotheses
 
 
@@ -220,6 +221,8 @@ def format_brief(
     lower_is_better: bool,
     failure_clusters: list[dict] | None = None,
     env_warnings: list[str] | None = None,
+    cost_data: list | None = None,
+    cost_frontier: list | None = None,
 ) -> str:
     """Format the research briefing as markdown."""
     direction = "lower" if lower_is_better else "higher"
@@ -312,6 +315,51 @@ def format_brief(
         lines.append("")
         lines.append("*Results may not be directly comparable. Consider re-running the best experiment in the current environment.*")
 
+    # Cost-performance analysis (only if train_seconds data exists)
+    if cost_data and cost_frontier is not None:
+        lines.extend(["", "## Cost-Performance Analysis", ""])
+
+        frontier_ids = {r.experiment_id for r in cost_frontier}
+
+        lines.append("**Pareto frontier (efficient set):**")
+        if cost_frontier:
+            for r in cost_frontier:
+                lines.append(
+                    f"- {r.experiment_id} ({r.model_type}): "
+                    f"{metric}={r.metric_value:.4f}, time={_format_seconds(r.train_seconds)}"
+                )
+        else:
+            lines.append("- No Pareto-optimal experiments found.")
+
+        # Compare best metric vs cheapest frontier alternative
+        if len(cost_frontier) >= 2:
+            if lower_is_better:
+                best_cost_exp = min(cost_data, key=lambda r: r.metric_value)
+            else:
+                best_cost_exp = max(cost_data, key=lambda r: r.metric_value)
+
+            cheapest = cost_frontier[0]  # sorted by train_seconds
+
+            if best_cost_exp.experiment_id != cheapest.experiment_id:
+                metric_diff = abs(best_cost_exp.metric_value - cheapest.metric_value)
+                if cheapest.metric_value != 0:
+                    pct = metric_diff / abs(cheapest.metric_value) * 100
+                else:
+                    pct = 0.0
+                if cheapest.train_seconds > 0:
+                    ratio = best_cost_exp.train_seconds / cheapest.train_seconds
+                else:
+                    ratio = float("inf")
+
+                lines.extend([
+                    "",
+                    f"Current best: **{best_cost_exp.experiment_id}** "
+                    f"({best_cost_exp.metric_value:.4f}, {_format_seconds(best_cost_exp.train_seconds)})",
+                    f"Cheapest acceptable: **{cheapest.experiment_id}** "
+                    f"({cheapest.metric_value:.4f}, {_format_seconds(cheapest.train_seconds)})",
+                    f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
+                ])
+
     lines.extend([
         "",
         "## Recommendations",
@@ -367,9 +415,15 @@ def generate_brief(
     failures = cluster_failures(experiments)
     env_warnings = detect_environment_drift(experiments)
 
+    # Load cost-performance data if available
+    cost_records = load_cost_data(log_path, metric)
+    pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
+
     return format_brief(
         campaign, best, trajectory, model_types, hypotheses,
         metric, lower_is_better, failures, env_warnings,
+        cost_data=cost_records if cost_records else None,
+        cost_frontier=pareto if cost_records else None,
     )
 
 
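
The hunks above only show the call sites; compute_pareto_frontier itself lives in scripts/cost_frontier.py, which this diff lists (+292 lines) but does not display. For readers skimming the diff, here is a minimal sketch of the kind of frontier computation being wired in, assuming records with metric_value and train_seconds attributes; this is not the package's actual implementation:

    def pareto_sketch(records, lower_is_better=False):
        # Walk runs from cheapest to most expensive training time and keep
        # each one that strictly improves on the best metric seen so far.
        # The result is sorted by train_seconds, which is what
        # "cheapest = cost_frontier[0]" in format_brief relies on.
        frontier, best = [], None
        for r in sorted(records, key=lambda r: r.train_seconds):
            improved = best is None or (
                r.metric_value < best if lower_is_better else r.metric_value > best
            )
            if improved:
                frontier.append(r)
                best = r.metric_value
        return frontier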