claude-turing 1.4.0 → 2.0.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/checkpoint.md +47 -0
- package/commands/export.md +48 -0
- package/commands/profile.md +43 -0
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +3 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/checkpoint_manager.py +449 -0
- package/templates/scripts/equivalence_checker.py +158 -0
- package/templates/scripts/export_card.py +183 -0
- package/templates/scripts/export_formats.py +385 -0
- package/templates/scripts/export_model.py +324 -0
- package/templates/scripts/generate_brief.py +38 -1
- package/templates/scripts/latency_benchmark.py +167 -0
- package/templates/scripts/profile_training.py +533 -0
- package/templates/scripts/scaffold.py +10 -0
package/templates/scripts/profile_training.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+"""Computational profiling for ML training runs.
+
+Measures timing breakdown, memory usage, throughput, and identifies
+bottlenecks. Maps bottleneck patterns to known fixes.
+
+Usage:
+    python scripts/profile_training.py                   # Profile best config
+    python scripts/profile_training.py --exp-id exp-042  # Specific experiment
+    python scripts/profile_training.py --detailed        # Include per-phase breakdown
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import tracemalloc
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+
+# Known bottleneck patterns -> recommendation mappings
+BOTTLENECK_RECOMMENDATIONS = {
+    "data_loading": [
+        "Cache preprocessed data to disk — data loading dominates training time",
+        "Use memory-mapped files or HDF5 for large datasets",
+        "Check for unnecessary file I/O in the data pipeline",
+    ],
+    "preprocessing": [
+        "Move preprocessing to a one-time step before training",
+        "Cache feature transformations (encoders, scalers) to avoid recomputation",
+        "Consider using vectorized operations instead of row-by-row processing",
+    ],
+    "model_training": [
+        "Reduce model complexity (fewer estimators, smaller depth)",
+        "Try a faster model type (LightGBM is typically faster than XGBoost)",
+        "Enable GPU acceleration if available and model supports it",
+    ],
+    "evaluation": [
+        "Reduce validation set size for intermediate checks",
+        "Evaluate less frequently (every N iterations instead of every iteration)",
+    ],
+    "memory": [
+        "Process data in batches to reduce peak memory",
+        "Use float32 instead of float64 for features",
+        "Release intermediate dataframes after use",
+    ],
+}
+
+
+def find_experiment(experiments: list[dict], exp_id: str | None, metric: str, lower_is_better: bool) -> dict | None:
+    """Find experiment by ID or return best kept."""
+    if exp_id:
+        for exp in experiments:
+            if exp.get("experiment_id") == exp_id:
+                return exp
+        return None
+    best = None
+    best_val = float("inf") if lower_is_better else float("-inf")
+    for exp in experiments:
+        if exp.get("status") != "kept":
+            continue
+        val = exp.get("metrics", {}).get(metric)
+        if val is None:
+            continue
+        if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+            best_val = val
+            best = exp
+    return best
+
+
+def get_system_memory_mb() -> float | None:
+    """Get current process RSS memory in MB from /proc/self/status."""
+    try:
+        with open("/proc/self/status") as f:
+            for line in f:
+                if line.startswith("VmRSS:"):
+                    return int(line.split()[1]) / 1024  # kB -> MB
+    except (FileNotFoundError, ValueError, IndexError):
+        pass
+    return None
+
+
+def get_peak_memory_mb() -> float | None:
+    """Get peak RSS memory in MB from /proc/self/status."""
+    try:
+        with open("/proc/self/status") as f:
+            for line in f:
+                if line.startswith("VmHWM:"):  # peak RSS high-water mark (VmPeak is peak *virtual* size)
+                    return int(line.split()[1]) / 1024  # kB -> MB
+    except (FileNotFoundError, ValueError, IndexError):
+        pass
+    return None
+
+
+def check_gpu_available() -> dict | None:
+    """Check for GPU availability and return info."""
+    # Try torch
+    try:
+        import torch
+        if torch.cuda.is_available():
+            return {
+                "type": "cuda",
+                "device": torch.cuda.get_device_name(0),
+                "memory_total_mb": round(torch.cuda.get_device_properties(0).total_memory / 1024**2),
+            }
+    except ImportError:
+        pass
+    # Try nvidia-smi
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            parts = result.stdout.strip().split(",")
+            return {
+                "type": "cuda",
+                "device": parts[0].strip(),
+                "memory_total_mb": int(parts[1].strip()) if len(parts) > 1 else None,
+            }
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        pass
+    return None
+
+
+def profile_training_run(seed: int = 42, timeout: int = 600) -> dict:
+    """Run train.py with profiling instrumentation.
+
+    Wraps the training run and captures timing, memory, and throughput.
+    """
+    # Start memory tracking
+    tracemalloc.start()
+    mem_before = get_system_memory_mb()
+
+    start_time = time.perf_counter()
+
+    # Run training
+    cmd = ["python", "train.py", "--seed", str(seed)]
+    try:
+        proc = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return {"error": f"Training timed out after {timeout}s"}
+
+    end_time = time.perf_counter()
+    total_time = end_time - start_time
+
+    # Memory snapshot
+    current, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+    mem_after = get_system_memory_mb()
+    peak_rss = get_peak_memory_mb()
+
+    # Parse metrics from output
+    metrics = {}
+    timing_data = {}
+    in_block = False
+    for line in proc.stdout.splitlines():
+        line_stripped = line.strip()
+        if line_stripped == "---":
+            if in_block:
+                break
+            in_block = True
+            continue
+        if in_block and ":" in line_stripped:
+            key, value = line_stripped.split(":", 1)
+            key = key.strip()
+            value = value.strip()
+            try:
+                metrics[key] = float(value)
+            except ValueError:
+                metrics[key] = value
+
+    # Extract timing if train.py reports it
+    train_seconds = metrics.get("train_seconds")
+    if isinstance(train_seconds, str):
+        try:
+            train_seconds = float(train_seconds)
+        except ValueError:
+            train_seconds = None
+
+    # Build profile
+    profile = {
+        "total_time_sec": round(total_time, 2),
+        "train_time_sec": round(float(train_seconds), 2) if train_seconds else round(total_time, 2),
+        "overhead_sec": round(total_time - float(train_seconds), 2) if train_seconds else 0,
+        "memory": {
+            "peak_rss_mb": round(peak_rss, 1) if peak_rss else None,
+            "python_peak_mb": round(peak / 1024**2, 1),
+            "rss_before_mb": round(mem_before, 1) if mem_before else None,
+            "rss_after_mb": round(mem_after, 1) if mem_after else None,
+        },
+        "metrics": metrics,
+        "returncode": proc.returncode,
+    }
+
+    # Check GPU
+    gpu_info = check_gpu_available()
+    if gpu_info:
+        profile["gpu"] = gpu_info
+        # Try to get GPU memory usage
+        try:
+            import torch
+            if torch.cuda.is_available():
+                profile["memory"]["peak_gpu_mb"] = round(torch.cuda.max_memory_allocated() / 1024**2, 1)
+        except ImportError:
+            pass
+
+    return profile
+
+
+def estimate_timing_breakdown(profile: dict) -> dict:
+    """Estimate timing breakdown from available data.
+
+    Since we can't instrument inside train.py without modifying it,
+    we estimate based on total time and known patterns.
+    """
+    total = profile.get("total_time_sec", 0)
+    if total <= 0:
+        return {}
+
+    train_secs = profile.get("metrics", {}).get("train_seconds")
+    if isinstance(train_secs, str):
+        try:
+            train_secs = float(train_secs)
+        except ValueError:
+            train_secs = None
+
+    overhead = profile.get("overhead_sec", 0)
+
+    breakdown = {
+        "total_sec": round(total, 2),
+        "overhead_sec": round(overhead, 2),
+        "overhead_pct": round(overhead / total * 100, 1) if total > 0 else 0,
+    }
+
+    if train_secs:
+        breakdown["training_sec"] = round(train_secs, 2)
+        breakdown["training_pct"] = round(train_secs / total * 100, 1)
+
+    return breakdown
+
+
+def detect_bottleneck(profile: dict, breakdown: dict) -> dict:
+    """Identify the primary bottleneck from profiling data."""
+    bottlenecks = []
+
+    total = profile.get("total_time_sec", 0)
+    overhead = profile.get("overhead_sec", 0)
+
+    # Check if overhead (data loading + setup) is dominant
+    if total > 0 and overhead > 0:
+        overhead_pct = overhead / total * 100
+        if overhead_pct > 50:
+            bottlenecks.append({
+                "type": "data_loading",
+                "severity": "high",
+                "description": f"Non-training overhead is {overhead_pct:.0f}% of total time ({overhead:.1f}s of {total:.1f}s)",
+                "pct_of_total": round(overhead_pct, 1),
+            })
+
+    # Check memory pressure
+    peak_rss = profile.get("memory", {}).get("peak_rss_mb")
+    if peak_rss and peak_rss > 4096:
+        bottlenecks.append({
+            "type": "memory",
+            "severity": "medium" if peak_rss < 8192 else "high",
+            "description": f"Peak memory usage is {peak_rss:.0f} MB",
+            "pct_of_total": 0,
+        })
+
+    # Check GPU utilization
+    gpu = profile.get("gpu")
+    peak_gpu = profile.get("memory", {}).get("peak_gpu_mb")
+    if gpu and peak_gpu:
+        gpu_total = gpu.get("memory_total_mb") or 0
+        if gpu_total > 0:
+            util_pct = peak_gpu / gpu_total * 100
+            if util_pct < 30:
+                bottlenecks.append({
+                    "type": "gpu_underutilized",
+                    "severity": "medium",
+                    "description": f"GPU memory utilization is only {util_pct:.0f}% — consider larger batch size",
+                    "pct_of_total": 0,
+                })
+
+    # Check training time relative to expectations
+    train_secs = profile.get("metrics", {}).get("train_seconds")
+    if isinstance(train_secs, (int, float)) and train_secs > 300:
+        bottlenecks.append({
+            "type": "model_training",
+            "severity": "medium",
+            "description": f"Training takes {train_secs:.0f}s — consider model simplification",
+            "pct_of_total": 0,
+        })
+
+    if not bottlenecks:
+        return {
+            "type": "none_detected",
+            "severity": "low",
+            "description": "No significant bottleneck detected",
+            "pct_of_total": 0,
+        }
+
+    # Return most severe
+    severity_order = {"high": 3, "medium": 2, "low": 1}
+    bottlenecks.sort(key=lambda b: -severity_order.get(b["severity"], 0))
+    return bottlenecks[0]
+
+
+def generate_recommendations(bottleneck: dict, profile: dict) -> list[str]:
+    """Generate actionable recommendations based on detected bottleneck."""
+    bt = bottleneck.get("type", "none_detected")
+    recs = BOTTLENECK_RECOMMENDATIONS.get(bt, [])
+
+    # Add context-specific recommendations
+    extra = []
+    peak_rss = profile.get("memory", {}).get("peak_rss_mb")
+    if peak_rss and peak_rss > 2048 and bt != "memory":
+        extra.append(f"Memory usage is {peak_rss:.0f} MB — monitor for OOM risk on smaller machines")
+
+    gpu = profile.get("gpu")
+    if not gpu:
+        extra.append("No GPU detected — GPU acceleration could significantly speed up training")
+
+    total = profile.get("total_time_sec", 0)
+    if total < 10:
+        extra.append("Training is very fast — profiling overhead may distort results")
+
+    return list(recs) + extra
+
+
+def compute_throughput(profile: dict) -> dict:
+    """Compute throughput metrics from profile data."""
+    total_time = profile.get("train_time_sec", profile.get("total_time_sec", 0))
+    metrics = profile.get("metrics", {})
+
+    n_samples = metrics.get("n_samples") or metrics.get("n_train_samples")
+    if isinstance(n_samples, str):
+        try:
+            n_samples = int(n_samples)
+        except ValueError:
+            n_samples = None
+
+    throughput = {}
+    if n_samples and total_time > 0:
+        throughput["samples_per_sec"] = round(n_samples / total_time, 1)
+
+    return throughput
+
+
+def save_profile(profile_data: dict, output_dir: str = "experiments/profiles") -> Path:
+    """Save profile results to YAML file."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    exp_id = profile_data.get("experiment_id", "unknown")
+    filepath = out_path / f"{exp_id}-profile.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(profile_data, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_profile_report(profile_data: dict) -> str:
+    """Format profile results as human-readable markdown."""
+    if "error" in profile_data:
+        return f"ERROR: {profile_data['error']}"
+
+    exp_id = profile_data.get("experiment_id", "?")
+    profile = profile_data.get("profile", {})
+    breakdown = profile_data.get("breakdown", {})
+    bottleneck = profile_data.get("bottleneck", {})
+    recommendations = profile_data.get("recommendations", [])
+    throughput = profile_data.get("throughput", {})
+
+    lines = [
+        f"# Training Profile: {exp_id}",
+        "",
+    ]
+
+    # Timing
+    lines.extend([
+        "## Timing",
+        "",
+        f"- **Total time:** {profile.get('total_time_sec', 0):.1f}s",
+    ])
+    if breakdown.get("training_sec"):
+        lines.append(f"- **Training:** {breakdown['training_sec']:.1f}s ({breakdown.get('training_pct', 0):.0f}%)")
+    if breakdown.get("overhead_sec", 0) > 0:
+        lines.append(f"- **Overhead:** {breakdown['overhead_sec']:.1f}s ({breakdown.get('overhead_pct', 0):.0f}%)")
+
+    # Memory
+    mem = profile.get("memory", {})
+    lines.extend(["", "## Memory", ""])
+    if mem.get("peak_rss_mb"):
+        lines.append(f"- **Peak RSS:** {mem['peak_rss_mb']:.0f} MB")
+    lines.append(f"- **Python peak:** {mem.get('python_peak_mb', 0):.1f} MB")
+    if mem.get("peak_gpu_mb"):
+        lines.append(f"- **GPU peak:** {mem['peak_gpu_mb']:.0f} MB")
+
+    # GPU
+    gpu = profile.get("gpu")
+    if gpu:
+        lines.extend(["", "## GPU", ""])
+        lines.append(f"- **Device:** {gpu.get('device', 'unknown')}")
+        if gpu.get("memory_total_mb"):
+            lines.append(f"- **Total VRAM:** {gpu['memory_total_mb']} MB")
+
+    # Throughput
+    if throughput:
+        lines.extend(["", "## Throughput", ""])
+        if throughput.get("samples_per_sec"):
+            lines.append(f"- **Samples/sec:** {throughput['samples_per_sec']:.1f}")
+
+    # Bottleneck
+    lines.extend([
+        "",
+        "## Bottleneck",
+        "",
+        f"**{bottleneck.get('type', 'none')}** ({bottleneck.get('severity', 'low')})",
+        "",
+        bottleneck.get("description", "No bottleneck detected."),
+    ])
+
+    # Recommendations
+    if recommendations:
+        lines.extend(["", "## Recommendations", ""])
+        for rec in recommendations:
+            lines.append(f"- {rec}")
+
+    return "\n".join(lines)
+
+
+def run_profile(
+    exp_id: str | None = None,
+    config_path: str = "config.yaml",
+    log_path: str = "experiments/log.jsonl",
+    seed: int = 42,
+    timeout: int = 600,
+) -> dict:
+    """Run a complete training profile.
+
+    Args:
+        exp_id: Experiment ID (defaults to best).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        seed: Random seed for the profiling run.
+        timeout: Training timeout in seconds.
+
+    Returns:
+        Complete profile result dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(log_path)
+    target_exp = find_experiment(experiments, exp_id, primary_metric, lower_is_better)
+
+    if not target_exp:
+        return {"error": f"No experiment found{f' with ID {exp_id}' if exp_id else ''}"}
+
+    target_id = target_exp.get("experiment_id", "unknown")
+
+    print(f"Profiling {target_id}...", file=sys.stderr)
+
+    # Run profiled training
+    profile = profile_training_run(seed=seed, timeout=timeout)
+
+    if "error" in profile:
+        return {"error": profile["error"], "experiment_id": target_id}
+
+    # Analyze results
+    breakdown = estimate_timing_breakdown(profile)
+    bottleneck = detect_bottleneck(profile, breakdown)
+    recommendations = generate_recommendations(bottleneck, profile)
+    throughput = compute_throughput(profile)
+
+    result = {
+        "experiment_id": target_id,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "profile": profile,
+        "breakdown": breakdown,
+        "bottleneck": bottleneck,
+        "recommendations": recommendations,
+        "throughput": throughput,
+    }
+
+    return result
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Computational profiling for ML training")
+    parser.add_argument("--exp-id", default=None, help="Experiment ID (defaults to best)")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--timeout", type=int, default=600, help="Training timeout in seconds")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+    args = parser.parse_args()
+
+    result = run_profile(
+        exp_id=args.exp_id,
+        config_path=args.config,
+        log_path=args.log,
+        seed=args.seed,
+        timeout=args.timeout,
+    )
+
+    if "error" not in result:
+        filepath = save_profile(result)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(result, indent=2, default=str))
+    else:
+        print(format_profile_report(result))
+
+
+if __name__ == "__main__":
+    main()
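
Note on the stdout contract: `profile_training_run()` recovers metrics by scanning the training subprocess's stdout for the first block delimited by `---` lines and parsing each `key: value` pair inside it. Below is a minimal sketch of a compatible `train.py` tail; the field names (`train_seconds`, `n_samples`, `accuracy`) are assumptions inferred from what the parser, `compute_throughput()`, and `detect_bottleneck()` look up, not a documented schema.

```python
# Hypothetical sketch, not shipped in the package: emit the "---"-delimited
# metrics block that profile_training_run() parses from train.py's stdout.
import time


def report_metrics(train_seconds: float, n_samples: int, accuracy: float) -> None:
    """Print key: value pairs between "---" delimiter lines (values parsed as floats)."""
    print("---")
    print(f"train_seconds: {train_seconds:.2f}")  # enables the training/overhead split
    print(f"n_samples: {n_samples}")              # enables samples_per_sec throughput
    print(f"accuracy: {accuracy:.4f}")            # carried through in profile["metrics"]
    print("---")


if __name__ == "__main__":
    start = time.perf_counter()
    # ... fit the model here ...
    report_metrics(time.perf_counter() - start, n_samples=10_000, accuracy=0.91)
```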
package/templates/scripts/scaffold.py
@@ -95,6 +95,13 @@ TEMPLATE_DIRS = {
         "diagnose_errors.py",
         "ablation_study.py",
         "pareto_frontier.py",
+        "profile_training.py",
+        "checkpoint_manager.py",
+        "export_model.py",
+        "export_formats.py",
+        "equivalence_checker.py",
+        "latency_benchmark.py",
+        "export_card.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }
@@ -108,6 +115,9 @@ DIRECTORIES_TO_CREATE = [
     "experiments/ablations",
     "experiments/frontiers",
     "experiments/predictions",
+    "experiments/profiles",
+    "experiments/checkpoints",
+    "exports",
     "models/best",
     "models/archive",
 ]
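
The scaffold additions above create `experiments/profiles/`, which is where `save_profile()` writes one `<experiment-id>-profile.yaml` per run. A minimal sketch of reading such a file back, assuming the layout produced by `run_profile()`; the experiment ID shown is hypothetical:

```python
# Hypothetical sketch: load a saved profile and print its headline numbers.
# The filename pattern comes from save_profile(); "exp-042" is a made-up ID.
from pathlib import Path

import yaml

data = yaml.safe_load(Path("experiments/profiles/exp-042-profile.yaml").read_text())
print(f"bottleneck: {data['bottleneck']['type']} ({data['bottleneck']['severity']})")
print(f"total time: {data['profile']['total_time_sec']}s")
for rec in data["recommendations"]:
    print(f"- {rec}")
```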