claude-turing 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as published in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +9 -2
- package/commands/annotate.md +23 -0
- package/commands/archive.md +23 -0
- package/commands/cite.md +23 -0
- package/commands/flashback.md +22 -0
- package/commands/present.md +23 -0
- package/commands/replay.md +23 -0
- package/commands/search.md +22 -0
- package/commands/template.md +22 -0
- package/commands/trend.md +21 -0
- package/commands/turing.md +14 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +7 -0
- package/templates/scripts/__pycache__/experiment_annotations.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_archive.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_replay.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_search.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_templates.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/session_flashback.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/trend_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/citation_manager.py +436 -0
- package/templates/scripts/experiment_annotations.py +392 -0
- package/templates/scripts/experiment_archive.py +534 -0
- package/templates/scripts/experiment_replay.py +592 -0
- package/templates/scripts/experiment_search.py +451 -0
- package/templates/scripts/experiment_templates.py +501 -0
- package/templates/scripts/generate_changelog.py +464 -0
- package/templates/scripts/generate_figures.py +597 -0
- package/templates/scripts/scaffold.py +12 -0
- package/templates/scripts/session_flashback.py +461 -0
- package/templates/scripts/trend_analysis.py +503 -0

package/templates/scripts/experiment_replay.py
@@ -0,0 +1,592 @@
#!/usr/bin/env python3
"""Re-run historical experiments with current infrastructure.

Read an old experiment's config from log.jsonl, plan a replay with
current code, data, and preprocessing, then compare original vs
replayed metrics. Answers the question: "would this old experiment
perform better/worse with today's pipeline?"

Usage:
    python scripts/experiment_replay.py exp-042
    python scripts/experiment_replay.py exp-042 --with-current-data
    python scripts/experiment_replay.py exp-042 --with-current-preprocessing
    python scripts/experiment_replay.py exp-042 --dry-run
    python scripts/experiment_replay.py --list
    python scripts/experiment_replay.py --json
"""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

from scripts.turing_io import load_config, load_experiments

DEFAULT_LOG_PATH = "experiments/log.jsonl"
DEFAULT_REPLAY_DIR = "experiments/replays"


# --- Replay Planning ---


def find_experiment(experiments: list[dict], experiment_id: str) -> dict | None:
    """Find an experiment by ID in the log."""
    for exp in experiments:
        if exp.get("experiment_id") == experiment_id:
            return exp
    return None


def plan_replay(
    original: dict,
    config: dict,
    with_current_data: bool = False,
    with_current_preprocessing: bool = False,
) -> dict:
    """Plan a replay of an original experiment.

    Determines what changes between original and current infrastructure,
    and constructs a replay configuration.

    Args:
        original: Original experiment dict from log.
        config: Current project config.
        with_current_data: Use current data instead of original data path.
        with_current_preprocessing: Use current preprocessing pipeline.

    Returns:
        Replay plan dict with config, changes, and warnings.
    """
    original_config = original.get("config", {})
    replay_config = dict(original_config)
    changes = []
    warnings = []

    # Data source
    if with_current_data:
        current_data = config.get("data", {}).get("path", "")
        original_data = original_config.get("data_path", "") or original_config.get("data", {}).get("path", "")
        if current_data and current_data != original_data:
            replay_config["data_path"] = current_data
            if isinstance(replay_config.get("data"), dict):
                replay_config["data"]["path"] = current_data
            changes.append({
                "field": "data_path",
                "original": original_data,
                "replay": current_data,
                "reason": "Using current data (--with-current-data)",
            })
        elif not current_data:
            warnings.append("No data path in current config — using original data path")

    # Preprocessing
    if with_current_preprocessing:
        current_preproc = config.get("preprocessing", {})
        original_preproc = original_config.get("preprocessing", {})
        if current_preproc and current_preproc != original_preproc:
            replay_config["preprocessing"] = current_preproc
            changes.append({
                "field": "preprocessing",
                "original": original_preproc,
                "replay": current_preproc,
                "reason": "Using current preprocessing (--with-current-preprocessing)",
            })

    # Check for missing dependencies or features
    model_type = original_config.get("model_type", "")
    if model_type:
        # Check if model type still exists in current codebase
        train_path = Path("train.py")
        if train_path.exists():
            train_content = train_path.read_text()
            if model_type not in train_content:
                warnings.append(
                    f"Model type '{model_type}' not found in current train.py — "
                    f"replay may fail"
                )

    # Seed handling — use same seed for reproducibility
    seed = original_config.get("seed", original.get("seed"))
    if seed is not None:
        replay_config["seed"] = seed
    else:
        replay_config["seed"] = 42
        warnings.append("No seed in original experiment — defaulting to 42")

    return {
        "original_id": original.get("experiment_id"),
        "original_timestamp": original.get("timestamp"),
        "original_metrics": original.get("metrics", {}),
        "replay_config": replay_config,
        "changes": changes,
        "warnings": warnings,
        "with_current_data": with_current_data,
        "with_current_preprocessing": with_current_preprocessing,
    }


# --- Replay Execution ---


def execute_replay(
    plan: dict,
    timeout: int = 600,
) -> dict:
    """Execute a replay by running train.py with the replay config.

    Args:
        plan: Replay plan from plan_replay.
        timeout: Max seconds for training.

    Returns:
        Execution result with replay metrics.
    """
    replay_config = plan.get("replay_config", {})
    started_at = datetime.now(timezone.utc).isoformat()

    # Write temporary config
    tmp_config = Path("experiments/replays/.replay-config.yaml")
    tmp_config.parent.mkdir(parents=True, exist_ok=True)
    with open(tmp_config, "w") as f:
        yaml.dump(replay_config, f, default_flow_style=False, sort_keys=False)

    # Run training
    cmd = ["python", "train.py", "--config", str(tmp_config)]
    seed = replay_config.get("seed")
    if seed is not None:
        cmd.extend(["--seed", str(seed)])

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return {
            "status": "timeout",
            "started_at": started_at,
            "error": f"Training exceeded {timeout}s timeout",
        }
    except FileNotFoundError:
        return {
            "status": "error",
            "started_at": started_at,
            "error": "train.py not found",
        }

    completed_at = datetime.now(timezone.utc).isoformat()

    if proc.returncode != 0:
        error_snippet = (proc.stderr + proc.stdout)[-500:]
        return {
            "status": "failed",
            "started_at": started_at,
            "completed_at": completed_at,
            "error": _classify_error(proc.stderr + proc.stdout),
            "stderr_tail": error_snippet,
        }

    # Parse metrics from stdout
    metrics = _parse_metrics(proc.stdout)

    # Clean up temp config
    try:
        tmp_config.unlink()
    except OSError:
        pass

    return {
        "status": "completed",
        "started_at": started_at,
        "completed_at": completed_at,
        "metrics": metrics,
    }


def _parse_metrics(stdout: str) -> dict:
    """Parse metrics from training output."""
    metrics = {}
    in_block = False
    for line in stdout.splitlines():
        line = line.strip()
        if line == "---":
            if in_block:
                break
            in_block = True
            continue
        if in_block and ":" in line:
            key, value = line.split(":", 1)
            try:
                metrics[key.strip()] = float(value.strip())
            except ValueError:
                metrics[key.strip()] = value.strip()
    return metrics


def _classify_error(output: str) -> str:
    """Classify error from output text."""
    output_lower = output.lower()
    if "cuda out of memory" in output_lower or "memoryerror" in output_lower:
        return "oom"
    if "nan" in output_lower and "loss" in output_lower:
        return "nan_loss"
    if "modulenotfounderror" in output_lower or "importerror" in output_lower:
        return "import_error"
    if "filenotfounderror" in output_lower:
        return "file_not_found"
    return "unknown"


# --- Comparison ---


def compare_metrics(
    original_metrics: dict,
    replay_metrics: dict,
    primary_metric: str = "accuracy",
    lower_is_better: bool = False,
) -> dict:
    """Compare original vs replayed metrics.

    Args:
        original_metrics: Metrics from the original experiment.
        replay_metrics: Metrics from the replay.
        primary_metric: Primary metric name.
        lower_is_better: Whether lower values are better.

    Returns:
        Comparison dict with deltas and verdict.
    """
    comparisons = {}
    all_metrics = set(list(original_metrics.keys()) + list(replay_metrics.keys()))

    for metric in sorted(all_metrics):
        orig = original_metrics.get(metric)
        replay = replay_metrics.get(metric)

        entry: dict = {"original": orig, "replay": replay}

        if orig is not None and replay is not None:
            try:
                orig_f = float(orig)
                replay_f = float(replay)
                delta = replay_f - orig_f
                pct = (delta / abs(orig_f) * 100) if orig_f != 0 else 0
                entry["delta"] = round(delta, 6)
                entry["delta_pct"] = round(pct, 2)

                lib = lower_is_better if metric == primary_metric else (
                    metric in {"loss", "mse", "rmse", "mae", "error_rate",
                               "train_seconds", "latency", "latency_ms"}
                )
                if lib:
                    entry["improved"] = delta < 0
                else:
                    entry["improved"] = delta > 0
            except (ValueError, TypeError):
                pass

        comparisons[metric] = entry

    # Overall verdict
    primary = comparisons.get(primary_metric, {})
    if primary.get("improved") is True:
        verdict = "improved"
    elif primary.get("improved") is False:
        verdict = "regressed"
    else:
        verdict = "inconclusive"

    return {
        "primary_metric": primary_metric,
        "verdict": verdict,
        "comparisons": comparisons,
    }


# --- Report ---


def format_replay_report(report: dict) -> str:
    """Format replay result as a readable markdown report."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    lines = [
        "# Experiment Replay",
        "",
        f"*{report.get('timestamp', '?')[:19]} UTC*",
        "",
    ]

    plan = report.get("plan", {})
    lines.extend([
        "## Original Experiment",
        "",
        f"- **ID:** {plan.get('original_id', '?')}",
        f"- **Timestamp:** {plan.get('original_timestamp', '?')[:19]}",
    ])

    orig_metrics = plan.get("original_metrics", {})
    if orig_metrics:
        lines.append("- **Metrics:** " + ", ".join(
            f"{k}={v:.4f}" if isinstance(v, float) else f"{k}={v}"
            for k, v in orig_metrics.items()
        ))
    lines.append("")

    # Changes
    changes = plan.get("changes", [])
    if changes:
        lines.extend(["## Changes from Original", ""])
        for ch in changes:
            lines.append(f"- **{ch['field']}**: {ch['reason']}")
        lines.append("")

    # Warnings
    warnings = plan.get("warnings", [])
    if warnings:
        lines.extend(["## Warnings", ""])
        for w in warnings:
            lines.append(f"- {w}")
        lines.append("")

    # Execution result
    execution = report.get("execution", {})
    status = execution.get("status", "not_run")
    lines.extend([
        "## Replay Result",
        "",
        f"**Status:** {status}",
    ])

    if status == "completed":
        # Comparison
        comparison = report.get("comparison", {})
        verdict = comparison.get("verdict", "?")
        primary = comparison.get("primary_metric", "?")
        lines.extend([
            f"**Verdict:** {verdict} (primary: {primary})",
            "",
            "| Metric | Original | Replay | Delta | Change |",
            "|--------|----------|--------|-------|--------|",
        ])

        for metric, data in comparison.get("comparisons", {}).items():
            orig = data.get("original")
            replay = data.get("replay")
            orig_str = f"{orig:.4f}" if isinstance(orig, float) else str(orig or "—")
            replay_str = f"{replay:.4f}" if isinstance(replay, float) else str(replay or "—")
            delta = data.get("delta_pct")
            delta_str = f"{delta:+.2f}%" if delta is not None else "—"
            improved = data.get("improved")
            if improved is True:
                change = "improved"
            elif improved is False:
                change = "regressed"
            else:
                change = "—"
            lines.append(f"| {metric} | {orig_str} | {replay_str} | {delta_str} | {change} |")
    elif status in ("failed", "timeout", "error"):
        lines.append(f"**Error:** {execution.get('error', 'unknown')}")

    lines.extend(["", "---"])
    return "\n".join(lines)


def save_replay_report(report: dict, replay_dir: str = DEFAULT_REPLAY_DIR) -> Path:
    """Save replay report to YAML."""
    p = Path(replay_dir)
    p.mkdir(parents=True, exist_ok=True)
    exp_id = report.get("plan", {}).get("original_id", "unknown")
    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    out = p / f"{exp_id}-replay-{ts}.yaml"
    with open(out, "w") as f:
        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
    return out


def list_replays(replay_dir: str = DEFAULT_REPLAY_DIR) -> list[dict]:
    """List all saved replay reports."""
    p = Path(replay_dir)
    if not p.exists():
        return []

    replays = []
    for path in sorted(p.glob("*-replay-*.yaml")):
        try:
            with open(path) as f:
                data = yaml.safe_load(f)
            if not isinstance(data, dict):
                continue
            plan = data.get("plan", {})
            execution = data.get("execution", {})
            comparison = data.get("comparison", {})
            replays.append({
                "file": path.name,
                "original_id": plan.get("original_id"),
                "timestamp": data.get("timestamp", ""),
                "status": execution.get("status", "?"),
                "verdict": comparison.get("verdict", "?"),
            })
        except (yaml.YAMLError, OSError):
            continue

    return replays


# --- Orchestration ---


def run_replay(
    experiment_id: str | None = None,
    with_current_data: bool = False,
    with_current_preprocessing: bool = False,
    dry_run: bool = False,
    list_mode: bool = False,
    timeout: int = 600,
    log_path: str = DEFAULT_LOG_PATH,
    config_path: str = "config.yaml",
    replay_dir: str = DEFAULT_REPLAY_DIR,
) -> dict:
    """Run experiment replay workflow.

    Args:
        experiment_id: Experiment to replay.
        with_current_data: Use current data.
        with_current_preprocessing: Use current preprocessing.
        dry_run: Plan only, don't execute.
        list_mode: List previous replays.
        timeout: Training timeout in seconds.
        log_path: Path to experiment log.
        config_path: Path to config.yaml.
        replay_dir: Directory for replay reports.

    Returns:
        Replay result dict.
    """
    timestamp = datetime.now(timezone.utc).isoformat()

    if list_mode:
        replays = list_replays(replay_dir)
        return {
            "timestamp": timestamp,
            "action": "list",
            "count": len(replays),
            "replays": replays,
        }

    if not experiment_id:
        return {"error": "Experiment ID required. Use --list to see past replays."}

    config = load_config(config_path)
    experiments = load_experiments(log_path)

    if not experiments:
        return {"timestamp": timestamp, "error": "No experiments found"}

    original = find_experiment(experiments, experiment_id)
    if original is None:
        return {"timestamp": timestamp, "error": f"Experiment '{experiment_id}' not found"}

    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    # Plan
    plan = plan_replay(original, config, with_current_data, with_current_preprocessing)

    report: dict = {
        "timestamp": timestamp,
        "plan": plan,
    }

    if dry_run:
        report["execution"] = {"status": "dry_run"}
        saved = save_replay_report(report, replay_dir)
        report["saved_to"] = str(saved)
        return report

    # Execute
    execution = execute_replay(plan, timeout=timeout)
    report["execution"] = execution

    # Compare if completed
    if execution.get("status") == "completed":
        comparison = compare_metrics(
            plan.get("original_metrics", {}),
            execution.get("metrics", {}),
            primary_metric=primary_metric,
            lower_is_better=lower_is_better,
        )
        report["comparison"] = comparison

    # Save
    saved = save_replay_report(report, replay_dir)
    report["saved_to"] = str(saved)

    return report


def main() -> None:
    """CLI entry point."""
    parser = argparse.ArgumentParser(description="Re-run historical experiments")
    parser.add_argument("experiment_id", nargs="?", default=None,
                        help="Experiment ID to replay")
    parser.add_argument("--with-current-data", action="store_true",
                        help="Use current data instead of original")
    parser.add_argument("--with-current-preprocessing", action="store_true",
                        help="Use current preprocessing pipeline")
    parser.add_argument("--dry-run", action="store_true",
                        help="Plan replay without executing")
    parser.add_argument("--list", dest="list_mode", action="store_true",
                        help="List previous replays")
    parser.add_argument("--timeout", type=int, default=600,
                        help="Training timeout in seconds (default: 600)")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--replay-dir", default=DEFAULT_REPLAY_DIR,
                        help="Directory for replay reports")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    report = run_replay(
        experiment_id=args.experiment_id,
        with_current_data=args.with_current_data,
        with_current_preprocessing=args.with_current_preprocessing,
        dry_run=args.dry_run,
        list_mode=args.list_mode,
        timeout=args.timeout,
        log_path=args.log,
        config_path=args.config,
        replay_dir=args.replay_dir,
    )

    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        if "error" in report:
            print(f"ERROR: {report['error']}", file=sys.stderr)
            sys.exit(1)

        if report.get("action") == "list":
            replays = report.get("replays", [])
            if not replays:
                print("No replays found.")
            else:
                print("# Experiment Replays")
                print()
                print("| Original | Date | Status | Verdict |")
                print("|----------|------|--------|---------|")
                for r in replays:
                    print(f"| {r['original_id']} | {r['timestamp'][:10]} "
                          f"| {r['status']} | {r['verdict']} |")
        else:
            print(format_replay_report(report))


if __name__ == "__main__":
    main()
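
For anyone wiring the new replay script up to their own train.py: _parse_metrics above expects stdout to contain a block of key: value lines delimited by "---" markers, parsing values as floats where possible. A minimal sketch of that contract; the metric names and values here are hypothetical, not taken from the package:

sample_stdout = """
epoch 3/3 complete
---
accuracy: 0.912
loss: 0.234
train_seconds: 87.5
---
"""

# Given the parser above, _parse_metrics(sample_stdout) returns
# {"accuracy": 0.912, "loss": 0.234, "train_seconds": 87.5}.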