claude-turing 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +66 -3
- package/commands/card.md +36 -0
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +2 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +58 -3
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +8 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Metric trajectory chart generator for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Reads experiments/log.jsonl and plots the primary metric over experiment
|
|
5
|
+
sequence. Produces publication-ready SVG or PNG charts suitable for papers
|
|
6
|
+
and slides, or opens an interactive window for exploratory analysis.
|
|
7
|
+
|
|
8
|
+
Chart anatomy:
|
|
9
|
+
- Green dots: kept experiments (passed evaluation)
|
|
10
|
+
- Red dots: discarded experiments (failed evaluation or were dropped)
|
|
11
|
+
- Blue step line: "best so far" running maximum (or minimum, if lower_is_better)
|
|
12
|
+
- Dashed horizontal line: convergence threshold band around best value
|
|
13
|
+
- Star annotation: best experiment ID and value
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
python scripts/plot_trajectory.py # Interactive
|
|
17
|
+
python scripts/plot_trajectory.py --output trajectory.svg # SVG for papers
|
|
18
|
+
python scripts/plot_trajectory.py --output trajectory.png --dpi 300 # High-res PNG
|
|
19
|
+
python scripts/plot_trajectory.py --last 20 --no-discarded # Clean recent view
|
|
20
|
+
python scripts/plot_trajectory.py --metric f1_weighted # Specific metric
|
|
21
|
+
|
|
22
|
+
Exit codes:
|
|
23
|
+
0 = success
|
|
24
|
+
1 = error (no experiments, missing metric, bad args)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import json
|
|
31
|
+
import sys
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Optional
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Config loading
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_config(config_path: str) -> dict:
    """Load relevant settings from config.yaml.

    Args:
        config_path: Path to the YAML config file.

    Returns:
        Dict with keys: primary_metric, lower_is_better, patience,
        improvement_threshold, project_name.

    Falls back to safe defaults if config is missing or malformed.
    """
    defaults: dict = {
        "primary_metric": "accuracy",
        "lower_is_better": False,
        "patience": 3,
        "improvement_threshold": 0.005,
        "project_name": "ML Project",
    }

    path = Path(config_path)
    if not path.exists():
        return defaults

    try:
        import yaml  # yaml is already in the autoresearch env

        with open(path) as f:
            config = yaml.safe_load(f) or {}

        eval_cfg = config.get("evaluation", {})
        conv_cfg = config.get("convergence", {})

        # Attempt to derive a human-readable project name from the data source
        # or fall back to the directory name of the config file.
        data_source = config.get("data", {}).get("source", "")
        project_name = (
            Path(data_source).stem.replace("_", " ").title()
            if data_source and not data_source.startswith("{{")
            else Path(config_path).parent.name.replace("_", " ").title()
        # Fix: a bare "config.yaml" in the CWD has parent.name == "", which
        # produced an empty title — fall back to the default project name.
        ) or defaults["project_name"]

        return {
            "primary_metric": eval_cfg.get("primary_metric", defaults["primary_metric"]),
            "lower_is_better": eval_cfg.get("lower_is_better", defaults["lower_is_better"]),
            "patience": conv_cfg.get("patience", defaults["patience"]),
            "improvement_threshold": conv_cfg.get(
                "improvement_threshold", defaults["improvement_threshold"]
            ),
            "project_name": project_name,
        }
    except Exception as exc:  # pragma: no cover
        print(f"plot_trajectory: Warning — could not parse config: {exc}", file=sys.stderr)
        return defaults
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Experiment log loading
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def load_experiments(log_path: str, metric: str) -> list[dict]:
    """Collect every log entry that records the requested metric.

    Args:
        log_path: Path to experiments/log.jsonl.
        metric: Metric name to extract.

    Returns:
        Chronologically ordered list of dicts with keys:
            index          int — 1-based sequence number among matching entries
            experiment_id  str
            value          float — metric value
            status         str — "kept" | "discarded" | other

    Blank lines, malformed JSON, and entries that are missing the metric
    (or hold a non-numeric value) are silently skipped.
    """
    source = Path(log_path)
    if not source.exists():
        return []

    parsed: list[dict] = []
    with open(source) as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                continue

            raw_value = record.get("metrics", {}).get(metric)
            if raw_value is None:
                # This entry never measured the requested metric — skip it.
                continue
            try:
                numeric = float(raw_value)
            except (TypeError, ValueError):
                continue

            position = len(parsed) + 1
            parsed.append({
                "index": position,
                "experiment_id": record.get("experiment_id", f"exp-{position:03d}"),
                "value": numeric,
                "status": record.get("status", "unknown"),
            })

    return parsed
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
# Best-so-far computation
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def compute_best_so_far(
    experiments: list[dict],
    lower_is_better: bool,
) -> list[float]:
    """Return a list of running-best values aligned with experiments.

    For each position i, the returned element is the best metric value
    observed among experiments[0..i] (inclusive), regardless of status.
    """
    pick = min if lower_is_better else max
    running: list[float] = []
    current_best: Optional[float] = None
    for record in experiments:
        value = record["value"]
        current_best = value if current_best is None else pick(current_best, value)
        running.append(current_best)
    return running
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# Convergence detection (reused logic from check_convergence.py)
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def detect_convergence(
    experiments: list[dict],
    patience: int,
    improvement_threshold: float,
    lower_is_better: bool,
) -> bool:
    """Return True if the last *patience* kept experiments show no meaningful gain."""
    kept = [e for e in experiments if e["status"] == "kept"]
    if len(kept) < patience:
        return False

    window_start = len(kept) - patience
    for pos in range(window_start, len(kept)):
        history = [e["value"] for e in kept[:pos]]
        if not history:
            # First kept experiment has no baseline to improve on.
            return False
        baseline = min(history) if lower_is_better else max(history)
        candidate = kept[pos]["value"]
        if baseline == 0:
            # Avoid division by zero: any change from zero counts as a full gain.
            gain = 0.0 if candidate == 0 else 1.0
        else:
            delta = (baseline - candidate) if lower_is_better else (candidate - baseline)
            gain = delta / abs(baseline)

        if gain >= improvement_threshold:
            # At least one non-trivial improvement inside the window.
            return False

    return True
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Plotting
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def parse_figsize(figsize_str: str) -> tuple[float, float]:
    """Parse a WxH string (e.g. '10x6') into a (width, height) tuple."""
    try:
        # Unpacking fails (ValueError) unless exactly two fields are present;
        # non-string input raises AttributeError on .lower().
        width_s, height_s = figsize_str.lower().split("x")
        return float(width_s), float(height_s)
    except (ValueError, AttributeError) as exc:
        raise argparse.ArgumentTypeError(
            f"Invalid --figsize '{figsize_str}'. Expected format: WxH (e.g. 10x6)"
        ) from exc
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _best_experiment(
|
|
229
|
+
experiments: list[dict],
|
|
230
|
+
lower_is_better: bool,
|
|
231
|
+
) -> dict:
|
|
232
|
+
"""Return the experiment with the single best metric value."""
|
|
233
|
+
return (min if lower_is_better else max)(experiments, key=lambda e: e["value"])
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def plot_trajectory(
    experiments: list[dict],
    metric: str,
    config: dict,
    args: argparse.Namespace,
) -> None:
    """Build and emit the trajectory chart.

    Args:
        experiments: Filtered experiment list (already sliced by --last).
        metric: Metric name being plotted.
        config: Parsed config dict (see load_config for keys).
        args: CLI arguments namespace.

    Side effects: either saves a chart to args.output (SVG/PNG) or opens an
    interactive matplotlib window; figures are closed before returning.
    """
    # Fix: removed `import matplotlib.patches as mpatches` — it was never used.
    import matplotlib.pyplot as plt

    # Style — use seaborn-v0_8-whitegrid when available, fall back gracefully
    available_styles = plt.style.available
    for candidate in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid", "ggplot", "default"):
        if candidate in available_styles or candidate == "default":
            plt.style.use(candidate)
            break

    lower_is_better: bool = config["lower_is_better"]
    patience: int = config["patience"]
    improvement_threshold: float = config["improvement_threshold"]

    # Separate kept vs discarded
    kept = [e for e in experiments if e["status"] == "kept"]
    discarded = [e for e in experiments if e["status"] != "kept"]

    # Best-so-far step line (computed over all experiments)
    best_so_far = compute_best_so_far(experiments, lower_is_better)
    xs_all = [e["index"] for e in experiments]

    # Best experiment overall
    best_exp = _best_experiment(experiments, lower_is_better)
    best_value = best_exp["value"]

    # Convergence threshold band: best ± threshold × |best|
    threshold_delta = improvement_threshold * abs(best_value)
    if lower_is_better:
        threshold_y = best_value + threshold_delta
    else:
        threshold_y = best_value - threshold_delta

    # Convergence detection
    converged = detect_convergence(
        experiments, patience, improvement_threshold, lower_is_better
    )

    # -----------------------------------------------------------------------
    # Figure setup
    # -----------------------------------------------------------------------
    figsize = parse_figsize(args.figsize)
    fig, ax = plt.subplots(figsize=figsize)

    # -----------------------------------------------------------------------
    # Best-so-far step function
    # -----------------------------------------------------------------------
    ax.step(
        xs_all,
        best_so_far,
        where="post",
        color="#2166ac",
        linewidth=1.8,
        alpha=0.85,
        label="Best so far",
        zorder=2,
    )

    # -----------------------------------------------------------------------
    # Convergence threshold dashed line
    # -----------------------------------------------------------------------
    ax.axhline(
        threshold_y,
        color="#762a83",
        linestyle="--",
        linewidth=1.2,
        alpha=0.7,
        label=f"Convergence threshold ({improvement_threshold * 100:.1f}% from best)",
        zorder=2,
    )

    # -----------------------------------------------------------------------
    # Scatter: kept experiments (green)
    # -----------------------------------------------------------------------
    if kept:
        ax.scatter(
            [e["index"] for e in kept],
            [e["value"] for e in kept],
            color="#4dac26",
            s=55,
            zorder=4,
            alpha=0.9,
            edgecolors="white",
            linewidths=0.6,
            label=f"Kept ({len(kept)})",
        )

    # -----------------------------------------------------------------------
    # Scatter: discarded experiments (red) — hidden if --no-discarded
    # -----------------------------------------------------------------------
    if discarded and not args.no_discarded:
        ax.scatter(
            [e["index"] for e in discarded],
            [e["value"] for e in discarded],
            color="#d01c8b",
            s=40,
            zorder=3,
            alpha=0.65,
            marker="x",
            linewidths=1.2,
            label=f"Discarded ({len(discarded)})",
        )

    # -----------------------------------------------------------------------
    # Annotate best experiment
    # -----------------------------------------------------------------------
    best_x = best_exp["index"]
    ax.scatter(
        [best_x],
        [best_value],
        color="#d73027",
        s=130,
        zorder=5,
        marker="*",
        edgecolors="#7f0000",
        linewidths=0.6,
    )

    # Choose annotation offset direction: push up for higher-is-better, down for lower.
    # Note: `a - b or 1.0` guards against a zero-height y-range.
    vert_offset = 0.015 * (ax.get_ylim()[1] - ax.get_ylim()[0] or 1.0)
    annotation_y = best_value + vert_offset if not lower_is_better else best_value - vert_offset

    ax.annotate(
        f"Best: {best_exp['experiment_id']}\n{best_value:.4f}",
        xy=(best_x, best_value),
        xytext=(best_x + max(1, len(experiments) * 0.04), annotation_y),
        fontsize=8,
        color="#7f0000",
        arrowprops=dict(
            arrowstyle="->",
            color="#7f0000",
            lw=1.0,
        ),
        bbox=dict(
            boxstyle="round,pad=0.25",
            facecolor="white",
            edgecolor="#7f0000",
            alpha=0.85,
            linewidth=0.8,
        ),
        zorder=6,
    )

    # -----------------------------------------------------------------------
    # Converged annotation
    # -----------------------------------------------------------------------
    if converged:
        ax.text(
            0.98,
            0.05,
            f"Converged\n(patience={patience})",
            transform=ax.transAxes,
            fontsize=8,
            color="#762a83",
            ha="right",
            va="bottom",
            bbox=dict(
                boxstyle="round,pad=0.3",
                facecolor="#f7f4f9",
                edgecolor="#762a83",
                alpha=0.9,
                linewidth=0.8,
            ),
        )

    # -----------------------------------------------------------------------
    # Labels, title, legend
    # -----------------------------------------------------------------------
    direction_hint = "(lower is better)" if lower_is_better else "(higher is better)"
    ax.set_xlabel("Experiment #", fontsize=11)
    ax.set_ylabel(f"{metric} {direction_hint}", fontsize=11)

    if args.title:
        title = args.title
    else:
        project_name = config.get("project_name", "ML Project")
        title = f"{project_name} — {metric} trajectory"

    ax.set_title(title, fontsize=13, fontweight="bold", pad=12)

    # Integer x-ticks only
    ax.xaxis.get_major_locator().set_params(integer=True)  # type: ignore[attr-defined]

    ax.legend(fontsize=9, framealpha=0.85, loc="best")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    fig.tight_layout()

    # -----------------------------------------------------------------------
    # Output
    # -----------------------------------------------------------------------
    if args.output:
        out_path = Path(args.output)
        ext = out_path.suffix.lower()
        if ext == ".png":
            fig.savefig(out_path, dpi=args.dpi, bbox_inches="tight")
        elif ext == ".svg":
            fig.savefig(out_path, format="svg", bbox_inches="tight")
        else:
            # Unknown extension — warn but still save rather than fail.
            print(
                f"plot_trajectory: Warning — unknown extension '{ext}', saving as PNG.",
                file=sys.stderr,
            )
            fig.savefig(out_path, dpi=args.dpi, bbox_inches="tight")
        plt.close(fig)
        print(f"plot_trajectory: Saved to {out_path}")
    else:
        plt.show()
        plt.close(fig)
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# ---------------------------------------------------------------------------
|
|
463
|
+
# CLI
|
|
464
|
+
# ---------------------------------------------------------------------------
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description=(
            "Plot primary metric trajectory from experiments/log.jsonl.\n"
            "Output can be SVG (default, best for papers), PNG, or interactive."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            "  python scripts/plot_trajectory.py\n"
            "  python scripts/plot_trajectory.py --output trajectory.svg\n"
            "  python scripts/plot_trajectory.py --output trajectory.png --dpi 300\n"
            "  python scripts/plot_trajectory.py --last 20 --no-discarded\n"
            "  python scripts/plot_trajectory.py --metric f1_weighted\n"
        ),
    )
    # Bind the method once; each call below registers one option in order.
    add = parser.add_argument
    add(
        "--log",
        default="experiments/log.jsonl",
        metavar="PATH",
        help="Path to experiment log (default: experiments/log.jsonl)",
    )
    add(
        "--config",
        default="config.yaml",
        metavar="PATH",
        help="Path to config.yaml (default: config.yaml)",
    )
    add(
        "--output",
        default=None,
        metavar="FILE",
        help="Output file path (.svg or .png). Omit for interactive display.",
    )
    add(
        "--metric",
        default=None,
        metavar="NAME",
        help="Metric to plot (default: primary_metric from config.yaml)",
    )
    add(
        "--last",
        type=int,
        default=None,
        metavar="N",
        help="Only plot the last N experiments",
    )
    add(
        "--no-discarded",
        action="store_true",
        help="Hide discarded experiments from the chart",
    )
    add(
        "--title",
        default=None,
        metavar="TEXT",
        help="Override the auto-generated chart title",
    )
    add(
        "--figsize",
        default="10x6",
        metavar="WxH",
        help="Figure size in inches, width x height (default: 10x6)",
    )
    add(
        "--dpi",
        type=int,
        default=150,
        help="Resolution for PNG output in dots per inch (default: 150)",
    )
    return parser
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def main() -> None:
    """CLI entry point.

    Flow: verify matplotlib is importable → parse/validate CLI args →
    load config → resolve metric → load experiments → apply --last
    slicing → print a summary to stderr → delegate to plot_trajectory().
    Exits with status 1 on any error (missing matplotlib, missing log,
    no matching experiments).
    """
    # Guard: matplotlib must be importable. Checked up front so the user gets
    # a clear install hint instead of a traceback from deep inside plotting.
    try:
        import matplotlib  # noqa: F401
    except ImportError:
        print(
            "plot_trajectory: Error — matplotlib is not installed.\n"
            "  Install it with: pip install matplotlib",
            file=sys.stderr,
        )
        sys.exit(1)

    parser = build_parser()
    args = parser.parse_args()

    # Validate --figsize early so we get a clean error message
    # (parse_figsize raises ArgumentTypeError; parser.error formats usage + exits 2).
    try:
        parse_figsize(args.figsize)
    except argparse.ArgumentTypeError as exc:
        parser.error(str(exc))

    # Load config (falls back to defaults if config.yaml is absent/unparseable)
    config = load_config(args.config)

    # Resolve metric name: explicit --metric wins over config's primary_metric
    metric = args.metric if args.metric else config["primary_metric"]

    # Load experiments
    experiments = load_experiments(args.log, metric)

    if not experiments:
        # Distinguish "log file missing" from "log exists but no entries
        # carry the requested metric" — the fixes are different.
        log_path = Path(args.log)
        if not log_path.exists():
            print(
                f"plot_trajectory: Error — log not found at '{args.log}'.\n"
                "  Run at least one experiment first.",
                file=sys.stderr,
            )
        else:
            print(
                f"plot_trajectory: Error — no experiments with metric '{metric}' "
                f"found in '{args.log}'.\n"
                "  Check the metric name or run experiments first.",
                file=sys.stderr,
            )
        sys.exit(1)

    # Apply --last filter
    if args.last is not None:
        if args.last < 1:
            parser.error("--last must be a positive integer")
        experiments = experiments[-args.last :]

    # Re-index so x-axis is contiguous after slicing
    if args.last is not None:
        for i, exp in enumerate(experiments, start=1):
            exp["index"] = i

    # Summary goes to stderr so stdout stays clean for the "Saved to" message.
    n_kept = sum(1 for e in experiments if e["status"] == "kept")
    n_disc = sum(1 for e in experiments if e["status"] != "kept")
    print(
        f"plot_trajectory: Plotting {len(experiments)} experiments "
        f"({n_kept} kept, {n_disc} discarded) for metric '{metric}'",
        file=sys.stderr,
    )

    plot_trajectory(experiments, metric, config, args)
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
if __name__ == "__main__":
|
|
611
|
+
main()
|
|
@@ -82,6 +82,14 @@ TEMPLATE_DIRS = {
|
|
|
82
82
|
"show_environment.py",
|
|
83
83
|
"turing_io.py",
|
|
84
84
|
"preflight.py",
|
|
85
|
+
"cleanup.py",
|
|
86
|
+
"generate_model_card.py",
|
|
87
|
+
"cost_frontier.py",
|
|
88
|
+
"leaderboard.py",
|
|
89
|
+
"diff_configs.py",
|
|
90
|
+
"export_results.py",
|
|
91
|
+
"plot_trajectory.py",
|
|
92
|
+
"treequest_suggest.py",
|
|
85
93
|
],
|
|
86
94
|
"tests": ["__init__.py", "conftest.py"],
|
|
87
95
|
}
|
|
@@ -44,9 +44,16 @@ def format_table(experiments: list[dict], best_id: str | None, metric_names: lis
|
|
|
44
44
|
if not experiments:
|
|
45
45
|
return "No experiments logged yet."
|
|
46
46
|
|
|
47
|
+
# Detect if any experiment has train_seconds
|
|
48
|
+
has_train_seconds = any(
|
|
49
|
+
exp.get("metrics", {}).get("train_seconds") is not None
|
|
50
|
+
for exp in experiments
|
|
51
|
+
)
|
|
52
|
+
|
|
47
53
|
# Build dynamic header based on configured metrics
|
|
48
54
|
metric_headers = "".join(f"{m:>12}" for m in metric_names)
|
|
49
|
-
|
|
55
|
+
time_header = f"{'Time':>10}" if has_train_seconds else ""
|
|
56
|
+
header = f"{'ID':<10} {'Status':<10} {'Model':<15}{metric_headers}{time_header} {'Timestamp':<22}"
|
|
50
57
|
sep = "-" * len(header)
|
|
51
58
|
lines = [header, sep]
|
|
52
59
|
|
|
@@ -62,9 +69,23 @@ def format_table(experiments: list[dict], best_id: str | None, metric_names: lis
|
|
|
62
69
|
else:
|
|
63
70
|
metric_values += f"{'N/A':>12}"
|
|
64
71
|
|
|
72
|
+
time_col = ""
|
|
73
|
+
if has_train_seconds:
|
|
74
|
+
train_secs = metrics.get("train_seconds")
|
|
75
|
+
if train_secs is not None:
|
|
76
|
+
if train_secs < 60:
|
|
77
|
+
time_col = f"{train_secs:.1f}s"
|
|
78
|
+
elif train_secs < 3600:
|
|
79
|
+
time_col = f"{train_secs / 60:.1f}m"
|
|
80
|
+
else:
|
|
81
|
+
time_col = f"{train_secs / 3600:.1f}h"
|
|
82
|
+
time_col = f"{time_col:>10}"
|
|
83
|
+
else:
|
|
84
|
+
time_col = f"{'N/A':>10}"
|
|
85
|
+
|
|
65
86
|
ts = exp.get("timestamp", "")[:19]
|
|
66
87
|
marker = " *BEST*" if exp.get("experiment_id") == best_id else ""
|
|
67
|
-
line = f"{exp.get('experiment_id', '?'):<10} {exp.get('status', '?'):<10} {model_type:<15}{metric_values} {ts}{marker}"
|
|
88
|
+
line = f"{exp.get('experiment_id', '?'):<10} {exp.get('status', '?'):<10} {model_type:<15}{metric_values}{time_col} {ts}{marker}"
|
|
68
89
|
lines.append(line)
|
|
69
90
|
|
|
70
91
|
return "\n".join(lines)
|