claude-turing 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +48 -7
- package/commands/brief.md +13 -1
- package/commands/card.md +36 -0
- package/commands/init.md +13 -0
- package/commands/train.md +16 -7
- package/commands/turing.md +4 -2
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/model_contract.md +49 -0
- package/templates/model_registry.yaml +69 -0
- package/templates/program.md +2 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +54 -0
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +9 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
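For orientation before the full listing: the new diff_configs.py shown below reads experiment records from experiments/log.jsonl, one JSON object per line. A minimal sketch of the record shape it expects, inferred from the field accesses in the code (experiment_id, status, timestamp, description, config, metrics); the authoritative schema is whatever the other template scripts in this package actually write, and the values here are illustrative only:

import json

# Hypothetical log entry, shaped to match the fields diff_configs.py reads.
entry = {
    "experiment_id": "exp-005",          # looked up by load_experiment()
    "status": "kept",                    # resolve_best() only considers "kept"
    "timestamp": "2025-01-15T09:30:00",  # stripped before diffing
    "description": "baseline random forest",
    "config": {"model": "rf", "hyperparams": {"max_depth": 4}},
    "metrics": {"accuracy": 0.81, "fit_seconds": 12.0},
}

with open("experiments/log.jsonl", "a") as f:
    f.write(json.dumps(entry) + "\n")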
package/templates/scripts/diff_configs.py (new file)
@@ -0,0 +1,534 @@
"""Compare configurations and metrics between two experiments.

Computes a structured diff of config dicts (recursively) and metric deltas,
showing what changed, what was added, and what was removed between two
experiment log entries.

Usage:
    python scripts/diff_configs.py exp-005 exp-012         # Human-readable diff
    python scripts/diff_configs.py exp-005 exp-012 --json  # Machine-readable JSON
    python scripts/diff_configs.py exp-005 best            # Compare against current best

The special keyword "best" resolves to the best kept experiment according to
the primary_metric and lower_is_better settings in config.yaml.

Ignored metadata fields: timestamp, experiment_id, git_commit.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

from scripts.turing_io import load_config, load_experiments

DEFAULT_LOG_PATH = "experiments/log.jsonl"

# Fields stripped before diffing — they change every run and carry no signal
IGNORED_METADATA_FIELDS = {"timestamp", "experiment_id", "git_commit"}

# Hyperparameter-related config keys to surface in the focused HP section
HYPERPARAMETER_KEYS = {"hyperparams", "model", "learning_rate", "n_estimators",
                       "max_depth", "dropout", "weight_decay", "batch_size",
                       "optimizer", "scheduler", "epochs", "num_layers"}


# ---------------------------------------------------------------------------
# Experiment loading helpers
# ---------------------------------------------------------------------------

def load_experiment(log_path: str, experiment_id: str) -> dict | None:
    """Load a single experiment entry by ID from experiments/log.jsonl.

    Args:
        log_path: Path to the JSONL log file.
        experiment_id: Experiment ID string, e.g. "exp-005".

    Returns:
        Experiment dict, or None if the ID is not found.
    """
    path = Path(log_path)
    if not path.exists():
        return None

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                if entry.get("experiment_id") == experiment_id:
                    return entry
            except json.JSONDecodeError:
                continue
    return None


def resolve_best(log_path: str, primary_metric: str, lower_is_better: bool) -> dict | None:
    """Return the best kept experiment by primary metric.

    Args:
        log_path: Path to the JSONL log file.
        primary_metric: Metric name used for ranking.
        lower_is_better: True for loss/error metrics, False for accuracy/F1.

    Returns:
        Best experiment dict, or None if no kept experiments exist.
    """
    experiments = load_experiments(log_path)
    best: dict | None = None
    best_value = float("inf") if lower_is_better else float("-inf")

    for exp in experiments:
        if exp.get("status") != "kept":
            continue
        value = exp.get("metrics", {}).get(primary_metric)
        if value is None:
            continue
        if (lower_is_better and value < best_value) or (not lower_is_better and value > best_value):
            best_value = value
            best = exp

    return best


# ---------------------------------------------------------------------------
# Core diff logic
# ---------------------------------------------------------------------------

def flatten_dict(d: dict, prefix: str = "") -> dict[str, Any]:
    """Recursively flatten a nested dict with dot-separated keys.

    Args:
        d: Dict to flatten (may contain nested dicts).
        prefix: Key prefix accumulated during recursion.

    Returns:
        Flat dict mapping dot-separated paths to leaf values.
    """
    out: dict[str, Any] = {}
    for k, v in d.items():
        full_key = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            out.update(flatten_dict(v, full_key))
        else:
            out[full_key] = v
    return out


def strip_ignored(d: dict) -> dict:
    """Remove ignored metadata fields from a shallow dict copy."""
    return {k: v for k, v in d.items() if k not in IGNORED_METADATA_FIELDS}


def compute_config_diff(config_a: dict, config_b: dict) -> dict:
    """Compute a structured diff between two config dicts.

    Recursively compares nested dicts using dot-separated key paths.

    Args:
        config_a: Config from experiment A (baseline).
        config_b: Config from experiment B (comparison target).

    Returns:
        Dict with keys:
            "changed": {key: {"old": val_a, "new": val_b}}
            "added": {key: val} — keys only in B
            "removed": {key: val} — keys only in A
            "unchanged_count": int — number of identical keys (not listed)
    """
    flat_a = flatten_dict(strip_ignored(config_a))
    flat_b = flatten_dict(strip_ignored(config_b))

    all_keys = set(flat_a) | set(flat_b)
    changed: dict[str, dict] = {}
    added: dict[str, Any] = {}
    removed: dict[str, Any] = {}
    unchanged_count = 0

    for key in sorted(all_keys):
        in_a = key in flat_a
        in_b = key in flat_b

        if in_a and in_b:
            if flat_a[key] != flat_b[key]:
                changed[key] = {"old": flat_a[key], "new": flat_b[key]}
            else:
                unchanged_count += 1
        elif in_b:
            added[key] = flat_b[key]
        else:
            removed[key] = flat_a[key]

    return {
        "changed": changed,
        "added": added,
        "removed": removed,
        "unchanged_count": unchanged_count,
    }


def compute_metric_diff(
    metrics_a: dict,
    metrics_b: dict,
    primary_metric: str,
    lower_is_better: bool,
) -> dict:
    """Compute metric deltas with improvement direction indicators.

    Args:
        metrics_a: Metrics from experiment A.
        metrics_b: Metrics from experiment B.
        primary_metric: Name of the primary metric for the project.
        lower_is_better: True for loss/error metrics.

    Returns:
        Dict with keys:
            "primary": {metric: {val_a, val_b, delta, direction, is_improvement}}
            "others": {metric: {val_a, val_b, delta, direction}}
            "added": {metric: val} — metrics only in B
            "removed": {metric: val} — metrics only in A
    """
    all_keys = set(metrics_a) | set(metrics_b)
    primary_result: dict[str, Any] = {}
    others: dict[str, dict] = {}
    added: dict[str, Any] = {}
    removed: dict[str, Any] = {}

    for key in sorted(all_keys):
        in_a = key in metrics_a
        in_b = key in metrics_b

        if in_a and in_b:
            val_a = metrics_a[key]
            val_b = metrics_b[key]

            if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
                delta = val_b - val_a
                if abs(delta) < 1e-10:
                    direction = "="
                    is_improvement = False
                elif lower_is_better:
                    direction = "↓" if delta < 0 else "↑"
                    is_improvement = delta < 0
                else:
                    direction = "↑" if delta > 0 else "↓"
                    is_improvement = delta > 0

                entry = {
                    "val_a": val_a,
                    "val_b": val_b,
                    "delta": delta,
                    "direction": direction,
                    "is_improvement": is_improvement,
                }
            else:
                entry = {
                    "val_a": val_a,
                    "val_b": val_b,
                    "delta": None,
                    "direction": "=" if val_a == val_b else "~",
                    "is_improvement": False,
                }

            if key == primary_metric:
                primary_result[key] = entry
            else:
                others[key] = entry

        elif in_b:
            added[key] = metrics_b[key]
        else:
            removed[key] = metrics_a[key]

    return {
        "primary": primary_result,
        "others": others,
        "added": added,
        "removed": removed,
    }


# ---------------------------------------------------------------------------
# Formatting helpers
# ---------------------------------------------------------------------------

def _is_hp_key(key: str) -> bool:
    """Return True if a flat config key belongs to a hyperparameter section."""
    parts = key.split(".")
    return bool(set(parts) & HYPERPARAMETER_KEYS)


def format_value(v: Any) -> str:
    """Render a value for display; floats use 6 significant figures."""
    if isinstance(v, float):
        return f"{v:.6g}"
    return str(v)


def format_text_diff(
    exp_a: dict,
    exp_b: dict,
    config_diff: dict,
    metric_diff: dict,
    primary_metric: str,
    lower_is_better: bool,
) -> str:
    """Render a human-readable diff report.

    Args:
        exp_a: Full experiment A dict.
        exp_b: Full experiment B dict.
        config_diff: Output of compute_config_diff.
        metric_diff: Output of compute_metric_diff.
        primary_metric: Name of primary metric.
        lower_is_better: True for loss/error metrics.

    Returns:
        Formatted multi-line string ready for printing.
    """
    id_a = exp_a.get("experiment_id", "?")
    id_b = exp_b.get("experiment_id", "?")

    lines: list[str] = []
    lines.append("=" * 70)
    lines.append(f" Config & Metric Diff: {id_a} → {id_b}")
    lines.append("=" * 70)

    # --- Primary metric spotlight ---
    lines.append("")
    lines.append("PRIMARY METRIC")
    lines.append("-" * 40)
    primary = metric_diff.get("primary", {})
    if primary:
        for metric_name, info in primary.items():
            val_a = info["val_a"]
            val_b = info["val_b"]
            delta = info["delta"]
            direction = info["direction"]
            improvement_label = ""
            if direction != "=":
                improvement_label = " [IMPROVED]" if info["is_improvement"] else " [REGRESSED]"

            direction_label = "lower=better" if lower_is_better else "higher=better"
            lines.append(
                f" {metric_name:<20s} {format_value(val_a):>12} → {format_value(val_b):<12}"
                f" {direction} Δ={format_value(delta) if delta is not None else 'N/A'}"
                f" ({direction_label}){improvement_label}"
            )
    else:
        lines.append(f" {primary_metric} not found in one or both experiments.")

    # --- Hyperparameter changes ---
    changed = config_diff.get("changed", {})
    added_cfg = config_diff.get("added", {})
    removed_cfg = config_diff.get("removed", {})

    hp_changed = {k: v for k, v in changed.items() if _is_hp_key(k)}
    hp_added = {k: v for k, v in added_cfg.items() if _is_hp_key(k)}
    hp_removed = {k: v for k, v in removed_cfg.items() if _is_hp_key(k)}

    other_changed = {k: v for k, v in changed.items() if not _is_hp_key(k)}
    other_added = {k: v for k, v in added_cfg.items() if not _is_hp_key(k)}
    other_removed = {k: v for k, v in removed_cfg.items() if not _is_hp_key(k)}

    lines.append("")
    lines.append("HYPERPARAMETER CHANGES")
    lines.append("-" * 40)
    if hp_changed or hp_added or hp_removed:
        for key, diff in sorted(hp_changed.items()):
            lines.append(f" ~ {key:<30s} {format_value(diff['old']):>15} → {format_value(diff['new'])}")
        for key, val in sorted(hp_added.items()):
            lines.append(f" + {key:<30s} {'':>15} {format_value(val)} (added)")
        for key, val in sorted(hp_removed.items()):
            lines.append(f" - {key:<30s} {format_value(val):>15} {''} (removed)")
    else:
        lines.append(" (no hyperparameter changes)")

    # --- Other config changes ---
    if other_changed or other_added or other_removed:
        lines.append("")
        lines.append("OTHER CONFIG CHANGES")
        lines.append("-" * 40)
        for key, diff in sorted(other_changed.items()):
            lines.append(f" ~ {key:<30s} {format_value(diff['old']):>15} → {format_value(diff['new'])}")
        for key, val in sorted(other_added.items()):
            lines.append(f" + {key:<30s} {'':>15} {format_value(val)} (added)")
        for key, val in sorted(other_removed.items()):
            lines.append(f" - {key:<30s} {format_value(val):>15} {''} (removed)")

    unchanged_count = config_diff.get("unchanged_count", 0)
    lines.append("")
    lines.append(f" ({unchanged_count} config keys unchanged)")

    # --- Other metrics ---
    other_metrics = metric_diff.get("others", {})
    metric_added = metric_diff.get("added", {})
    metric_removed = metric_diff.get("removed", {})

    if other_metrics or metric_added or metric_removed:
        lines.append("")
        lines.append("OTHER METRICS")
        lines.append("-" * 40)
        for metric_name, info in sorted(other_metrics.items()):
            direction = info["direction"]
            delta = info["delta"]
            delta_str = format_value(delta) if delta is not None else "N/A"
            lines.append(
                f" {metric_name:<22s} {format_value(info['val_a']):>12} → {format_value(info['val_b']):<12}"
                f" {direction} Δ={delta_str}"
            )
        for metric_name, val in sorted(metric_added.items()):
            lines.append(f" + {metric_name:<20s} (only in {id_b}): {format_value(val)}")
        for metric_name, val in sorted(metric_removed.items()):
            lines.append(f" - {metric_name:<20s} (only in {id_a}): {format_value(val)}")

    # --- Experiment metadata footer ---
    lines.append("")
    lines.append("EXPERIMENT INFO")
    lines.append("-" * 40)
    lines.append(f" {'ID':<12} {id_a:<24} {id_b}")
    lines.append(f" {'Status':<12} {exp_a.get('status', '?'):<24} {exp_b.get('status', '?')}")
    lines.append(f" {'Timestamp':<12} {exp_a.get('timestamp', '?')[:19]:<24} {exp_b.get('timestamp', '?')[:19]}")
    desc_a = (exp_a.get("description") or "")[:50]
    desc_b = (exp_b.get("description") or "")[:50]
    if desc_a or desc_b:
        lines.append(f" {'Description':<12} {desc_a:<24} {desc_b}")
    lines.append("=" * 70)

    return "\n".join(lines)


def format_json_diff(
    exp_a: dict,
    exp_b: dict,
    config_diff: dict,
    metric_diff: dict,
    primary_metric: str,
) -> str:
    """Render a machine-readable JSON diff.

    Args:
        exp_a: Full experiment A dict.
        exp_b: Full experiment B dict.
        config_diff: Output of compute_config_diff.
        metric_diff: Output of compute_metric_diff.
        primary_metric: Name of primary metric.

    Returns:
        JSON string.
    """
    result = {
        "experiment_a": exp_a.get("experiment_id"),
        "experiment_b": exp_b.get("experiment_id"),
        "primary_metric": primary_metric,
        "config_diff": config_diff,
        "metric_diff": metric_diff,
    }
    return json.dumps(result, indent=2, default=str)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main() -> None:
    """CLI entry point for config and metric diffing."""
    parser = argparse.ArgumentParser(
        description=(
            "Compare configurations and metrics between two experiments. "
            'Use "best" as the second experiment ID to compare against the '
            "current champion."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            "  python scripts/diff_configs.py exp-005 exp-012\n"
            "  python scripts/diff_configs.py exp-005 exp-012 --json\n"
            "  python scripts/diff_configs.py exp-005 best\n"
        ),
    )
    parser.add_argument("exp_a", help='Baseline experiment ID (e.g. "exp-005")')
    parser.add_argument(
        "exp_b",
        help='Target experiment ID, or "best" to use the current champion',
    )
    parser.add_argument(
        "--json",
        dest="json_output",
        action="store_true",
        help="Output machine-readable JSON instead of a text table",
    )
    parser.add_argument(
        "--log",
        default=DEFAULT_LOG_PATH,
        help=f"Path to experiments/log.jsonl (default: {DEFAULT_LOG_PATH})",
    )
    parser.add_argument(
        "--config",
        default="config.yaml",
        help="Path to config.yaml (default: config.yaml)",
    )

    args = parser.parse_args()

    # Load project config for primary_metric and direction
    cfg = load_config(args.config)
    eval_cfg = cfg.get("evaluation", {})
    primary_metric: str = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better: bool = eval_cfg.get("lower_is_better", False)

    # Resolve experiment A
    exp_a = load_experiment(args.log, args.exp_a)
    if exp_a is None:
        print(f"Error: experiment '{args.exp_a}' not found in {args.log}", file=sys.stderr)
        sys.exit(1)

    # Resolve experiment B (with "best" keyword support)
    if args.exp_b == "best":
        exp_b = resolve_best(args.log, primary_metric, lower_is_better)
        if exp_b is None:
            print(
                f"Error: no kept experiments found in {args.log} to use as 'best'.",
                file=sys.stderr,
            )
            sys.exit(1)
        resolved_id = exp_b.get("experiment_id", "?")
        if not args.json_output:
            print(f" Resolved 'best' → {resolved_id} (primary metric: {primary_metric})\n")
    else:
        exp_b = load_experiment(args.log, args.exp_b)
        if exp_b is None:
            print(f"Error: experiment '{args.exp_b}' not found in {args.log}", file=sys.stderr)
            sys.exit(1)

    # Compute diffs
    config_a = exp_a.get("config", {})
    config_b = exp_b.get("config", {})
    config_diff = compute_config_diff(config_a, config_b)

    metrics_a = exp_a.get("metrics", {})
    metrics_b = exp_b.get("metrics", {})
    metric_diff = compute_metric_diff(metrics_a, metrics_b, primary_metric, lower_is_better)

    # Render output
    if args.json_output:
        print(format_json_diff(exp_a, exp_b, config_diff, metric_diff, primary_metric))
    else:
        print(format_text_diff(exp_a, exp_b, config_diff, metric_diff, primary_metric, lower_is_better))


if __name__ == "__main__":
    main()
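A quick sketch of how the pure diff helpers in the listing above behave, using toy dicts (the config and metric names here are made up; the import path scripts.diff_configs assumes the scaffolded project layout implied by the script's own scripts.turing_io import):

from scripts.diff_configs import compute_config_diff, compute_metric_diff

cfg_a = {"model": "rf", "hyperparams": {"max_depth": 4, "n_estimators": 100}, "seed": 42}
cfg_b = {"model": "rf", "hyperparams": {"max_depth": 8, "n_estimators": 100}, "subsample": 0.9}

diff = compute_config_diff(cfg_a, cfg_b)
# diff["changed"]         -> {"hyperparams.max_depth": {"old": 4, "new": 8}}
# diff["added"]           -> {"subsample": 0.9}
# diff["removed"]         -> {"seed": 42}
# diff["unchanged_count"] -> 2  ("model" and "hyperparams.n_estimators")

mdiff = compute_metric_diff(
    {"accuracy": 0.81, "fit_seconds": 12.0},
    {"accuracy": 0.84, "fit_seconds": 15.5},
    primary_metric="accuracy",
    lower_is_better=False,
)
# mdiff["primary"]["accuracy"]["direction"]      -> "↑"
# mdiff["primary"]["accuracy"]["is_improvement"] -> True
# Note: the single lower_is_better flag applies to every metric, so
# fit_seconds going up is also marked "↑" under mdiff["others"]; only
# the primary metric's direction setting comes from config.yaml.

The equivalent command-line comparison would be python scripts/diff_configs.py exp-005 exp-012, per the usage block in the module docstring.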