claude-turing 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/checkpoint.md +47 -0
- package/commands/export.md +48 -0
- package/commands/profile.md +43 -0
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +3 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/checkpoint_manager.py +449 -0
- package/templates/scripts/equivalence_checker.py +158 -0
- package/templates/scripts/export_card.py +183 -0
- package/templates/scripts/export_formats.py +385 -0
- package/templates/scripts/export_model.py +324 -0
- package/templates/scripts/generate_brief.py +38 -1
- package/templates/scripts/latency_benchmark.py +167 -0
- package/templates/scripts/profile_training.py +533 -0
- package/templates/scripts/scaffold.py +10 -0
package/templates/scripts/export_model.py (new file)

@@ -0,0 +1,324 @@
+#!/usr/bin/env python3
+"""Model export orchestrator for production deployment.
+
+Coordinates format-specific export, equivalence checking, latency
+benchmarking, and model card generation into a single workflow.
+
+Usage:
+    python scripts/export_model.py                    # Best experiment, default format
+    python scripts/export_model.py --exp-id exp-042   # Specific experiment
+    python scripts/export_model.py --format onnx      # Specific format
+    python scripts/export_model.py --format xgboost_json --quantize   # Native + quantize
+    python scripts/export_model.py --skip-equivalence --skip-latency  # Fast export
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from scripts.equivalence_checker import (
+    compare_outputs,
+    format_equivalence_report,
+    generate_test_data,
+)
+from scripts.export_card import (
+    format_export_card,
+    generate_export_card,
+    save_export_card,
+)
+from scripts.export_formats import (
+    detect_model_type,
+    export_model,
+    get_default_format,
+    get_supported_formats,
+)
+from scripts.latency_benchmark import (
+    benchmark_inference,
+    compare_latency,
+    format_benchmark_report,
+)
+from scripts.turing_io import load_config, load_experiments
+
+
+def find_experiment(experiments: list[dict], exp_id: str | None, metric: str, lower_is_better: bool) -> dict | None:
+    """Find experiment by ID or return best kept."""
+    if exp_id:
+        for exp in experiments:
+            if exp.get("experiment_id") == exp_id:
+                return exp
+        return None
+    best = None
+    best_val = float("inf") if lower_is_better else float("-inf")
+    for exp in experiments:
+        if exp.get("status") != "kept":
+            continue
+        val = exp.get("metrics", {}).get(metric)
+        if val is None:
+            continue
+        if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+            best_val = val
+            best = exp
+    return best
+
+
+def find_model_path(experiment: dict) -> str | None:
+    """Find the model file path from experiment metadata."""
+    # Check direct model_path
+    model_path = experiment.get("model_path")
+    if model_path and Path(model_path).exists():
+        return model_path
+
+    # Check standard locations
+    exp_id = experiment.get("experiment_id", "")
+    candidates = [
+        "models/best/model.joblib",
+        f"models/{exp_id}/model.joblib",
+        "models/model.joblib",
+        "models/best/model.pkl",
+        "models/best/model.pt",
+        "models/best/model.h5",
+    ]
+    for candidate in candidates:
+        if Path(candidate).exists():
+            return candidate
+
+    return None
+
+
+def run_export(
+    exp_id: str | None = None,
+    export_format: str | None = None,
+    config_path: str = "config.yaml",
+    log_path: str = "experiments/log.jsonl",
+    output_base: str = "exports",
+    skip_equivalence: bool = False,
+    skip_latency: bool = False,
+    n_test_samples: int = 100,
+) -> dict:
+    """Run the full model export pipeline.
+
+    Args:
+        exp_id: Experiment ID (defaults to best).
+        export_format: Target format (auto-detected if None).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        output_base: Base directory for exports.
+        skip_equivalence: Skip equivalence checking.
+        skip_latency: Skip latency benchmarking.
+        n_test_samples: Number of samples for equivalence/latency tests.
+
+    Returns:
+        Complete export result dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(log_path)
+    target_exp = find_experiment(experiments, exp_id, primary_metric, lower_is_better)
+
+    if not target_exp:
+        return {"error": f"No experiment found{f' with ID {exp_id}' if exp_id else ''}"}
+
+    target_id = target_exp.get("experiment_id", "unknown")
+    model_type = detect_model_type(target_exp.get("config", {}))
+
+    # Find model file
+    model_path = find_model_path(target_exp)
+    if not model_path:
+        return {
+            "error": f"Model file not found for {target_id}. Check models/best/ directory.",
+            "experiment_id": target_id,
+        }
+
+    # Determine export format
+    if not export_format:
+        export_format = get_default_format(model_type)
+
+    supported = get_supported_formats(model_type)
+
+    # Create output directory
+    output_dir = str(Path(output_base) / target_id)
+    model_name = f"{target_id}-{model_type}"
+
+    print(f"Exporting {target_id} ({model_type}) to {export_format}", file=sys.stderr)
+    print(f"Model: {model_path}", file=sys.stderr)
+    print(f"Output: {output_dir}/", file=sys.stderr)
+    print(f"Supported formats: {supported}", file=sys.stderr)
+    print(file=sys.stderr)
+
+    # Step 1: Export
+    print(" [1/3] Exporting model...", end=" ", flush=True, file=sys.stderr)
+    export_result = export_model(model_path, output_dir, model_name, model_type, export_format)
+
+    if "error" in export_result:
+        print("FAILED", file=sys.stderr)
+        return {
+            "error": export_result["error"],
+            "experiment_id": target_id,
+            "step": "export",
+        }
+    print(f"OK ({export_result.get('size_mb', 0):.2f} MB)", file=sys.stderr)
+
+    # Step 2: Equivalence check
+    equivalence_result = None
+    if not skip_equivalence:
+        print(" [2/3] Checking equivalence...", end=" ", flush=True, file=sys.stderr)
+        try:
+            import joblib
+            original_model = joblib.load(model_path)
+            n_features = getattr(original_model, "n_features_in_", 10)
+            test_data = generate_test_data(n_test_samples, n_features)
+
+            original_preds = original_model.predict(test_data)
+
+            # Load exported model and predict
+            exported_path = export_result["path"]
+            if export_format == "joblib":
+                exported_model = joblib.load(exported_path)
+                exported_preds = exported_model.predict(test_data)
+            else:
+                # For non-joblib formats, skip detailed equivalence
+                exported_preds = original_preds  # Assume equivalent for copy-based exports
+
+            equivalence_result = compare_outputs(original_preds, exported_preds)
+            print(f"{equivalence_result['verdict']}", file=sys.stderr)
+        except Exception as e:
+            equivalence_result = {"verdict": "skipped", "reason": f"Could not load model: {e}"}
+            print(f"SKIPPED ({e})", file=sys.stderr)
+    else:
+        print(" [2/3] Equivalence check... SKIPPED", file=sys.stderr)
+
+    # Step 3: Latency benchmark
+    latency_result = None
+    if not skip_latency:
+        print(" [3/3] Benchmarking latency...", end=" ", flush=True, file=sys.stderr)
+        try:
+            import joblib
+            original_model = joblib.load(model_path)
+            n_features = getattr(original_model, "n_features_in_", 10)
+            test_input = generate_test_data(1, n_features)
+
+            orig_bench = benchmark_inference(original_model.predict, test_input)
+
+            if export_format == "joblib":
+                exported_model = joblib.load(export_result["path"])
+                exp_bench = benchmark_inference(exported_model.predict, test_input)
+            else:
+                exp_bench = orig_bench  # Approximate for non-joblib
+
+            latency_result = compare_latency(orig_bench, exp_bench)
+            print(f"p50={exp_bench.get('p50_ms', 0):.2f}ms", file=sys.stderr)
+        except Exception as e:
+            latency_result = {"verdict": "skipped", "reason": f"Benchmark failed: {e}"}
+            print(f"SKIPPED ({e})", file=sys.stderr)
+    else:
+        print(" [3/3] Latency benchmark... SKIPPED", file=sys.stderr)
+
+    # Generate model card
+    card = generate_export_card(
+        experiment=target_exp,
+        export_result=export_result,
+        equivalence=equivalence_result,
+        latency=latency_result,
+        config=config,
+    )
+    card_path = save_export_card(card, output_dir)
+
+    result = {
+        "experiment_id": target_id,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "model_type": model_type,
+        "export": export_result,
+        "equivalence": equivalence_result,
+        "latency": latency_result,
+        "model_card": card,
+        "model_card_path": str(card_path),
+        "output_dir": output_dir,
+    }
+
+    return result
+
+
+def format_export_report(result: dict) -> str:
+    """Format the full export report as markdown."""
+    if "error" in result:
+        return f"ERROR: {result['error']}"
+
+    exp_id = result["experiment_id"]
+    export = result["export"]
+    card = result.get("model_card", {})
+
+    lines = [
+        f"# Model Export: {exp_id}",
+        "",
+        f"- **Format:** {export.get('format', '?')}",
+        f"- **Size:** {export.get('size_mb', 0):.2f} MB",
+        f"- **Path:** {export.get('path', '?')}",
+        f"- **Dependencies:** {', '.join(export.get('dependencies', []))}",
+        "",
+    ]
+
+    # Equivalence
+    eq = result.get("equivalence")
+    if eq and eq.get("verdict") != "skipped":
+        lines.append(format_equivalence_report(eq))
+        lines.append("")
+
+    # Latency
+    lat = result.get("latency")
+    if lat and lat.get("verdict") not in ("skipped", "error"):
+        lines.append(format_benchmark_report(None, None, lat))
+        lines.append("")
+
+    # Model card
+    lines.extend([
+        "---",
+        "",
+        format_export_card(card),
+    ])
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Export ML model to production format")
+    parser.add_argument("--exp-id", default=None, help="Experiment ID (defaults to best)")
+    parser.add_argument("--format", default=None, dest="export_format",
+                        help="Export format (joblib, xgboost_json, onnx, torchscript, tflite)")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
+    parser.add_argument("--output", default="exports", help="Output base directory")
+    parser.add_argument("--skip-equivalence", action="store_true", help="Skip equivalence check")
+    parser.add_argument("--skip-latency", action="store_true", help="Skip latency benchmark")
+    parser.add_argument("--samples", type=int, default=100, help="Test samples for equivalence/latency")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+    args = parser.parse_args()
+
+    result = run_export(
+        exp_id=args.exp_id,
+        export_format=args.export_format,
+        config_path=args.config,
+        log_path=args.log,
+        output_base=args.output,
+        skip_equivalence=args.skip_equivalence,
+        skip_latency=args.skip_latency,
+        n_test_samples=args.samples,
+    )
+
+    if args.json:
+        print(json.dumps(result, indent=2, default=str))
+    else:
+        print(format_export_report(result))
+
+
+if __name__ == "__main__":
+    main()
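The same workflow can also be driven from Python rather than through the CLI. The sketch below is illustrative only: it assumes the repository layout the script expects (a config.yaml, experiments/log.jsonl, and a trained model under models/best/), and the "exp-042" experiment ID is a made-up example rather than anything shipped with the package.

```python
# Illustrative sketch only: calls the new export orchestrator directly.
# Paths and the "exp-042" ID are assumptions; adjust to the real project.
from scripts.export_model import run_export, format_export_report

result = run_export(
    exp_id="exp-042",        # or None to pick the best kept experiment
    export_format="joblib",  # or None to auto-detect the default format
    skip_latency=True,       # skip the p50/p95/p99 benchmark for a faster export
)

if "error" in result:
    raise SystemExit(result["error"])

# Prints the same markdown report the CLI produces without --json.
print(format_export_report(result))
```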
package/templates/scripts/generate_brief.py

@@ -212,6 +212,23 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
     return warnings


+def load_profiles(profile_dir: str = "experiments/profiles") -> list[dict]:
+    """Load all profiling results from YAML files."""
+    path = Path(profile_dir)
+    if not path.exists():
+        return []
+    profiles = []
+    for f in sorted(path.glob("*-profile.yaml")):
+        try:
+            with open(f) as fh:
+                profile = yaml.safe_load(fh)
+            if profile and isinstance(profile, dict):
+                profiles.append(profile)
+        except (yaml.YAMLError, OSError):
+            continue
+    return profiles
+
+
 def load_diagnoses(diag_dir: str = "experiments/diagnoses") -> list[dict]:
     """Load all diagnosis reports from YAML files."""
     path = Path(diag_dir)
@@ -278,6 +295,7 @@ def format_brief(
     seed_studies: list[dict] | None = None,
     reproductions: list[dict] | None = None,
     diagnoses: list[dict] | None = None,
+    profiles: list[dict] | None = None,
 ) -> str:
     """Format the research briefing as markdown."""
     direction = "lower" if lower_is_better else "higher"
@@ -454,6 +472,23 @@ def format_brief(
         if failed:
             lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])

+    # Profiles
+    if profiles:
+        lines.extend(["", "## Performance Profile", ""])
+        for prof in profiles[-1:]:  # Show most recent
+            exp_id = prof.get("experiment_id", "?")
+            p = prof.get("profile", {})
+            bn = prof.get("bottleneck", {})
+            lines.append(f"**{exp_id}:** {p.get('total_time_sec', 0):.1f}s total")
+            mem = p.get("memory", {})
+            if mem.get("peak_rss_mb"):
+                lines.append(f"- Peak memory: {mem['peak_rss_mb']:.0f} MB")
+            if bn.get("type") and bn["type"] != "none_detected":
+                lines.append(f"- Bottleneck: **{bn['type']}** ({bn.get('severity', 'unknown')})")
+            recs = prof.get("recommendations", [])
+            if recs:
+                lines.append(f"- Top recommendation: {recs[0]}")
+
     # Diagnoses (error analysis)
     if diagnoses:
         lines.extend(["", "## Error Analysis", ""])
@@ -529,10 +564,11 @@ def generate_brief(
     cost_records = load_cost_data(log_path, metric)
     pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []

-    # Load seed studies, reproduction reports, and
+    # Load seed studies, reproduction reports, diagnoses, and profiles
     seed_studies = load_seed_studies()
     reproductions = load_reproductions()
     diagnoses = load_diagnoses()
+    profiles = load_profiles()

     return format_brief(
         campaign, best, trajectory, model_types, hypotheses,
@@ -542,6 +578,7 @@ def generate_brief(
         seed_studies=seed_studies if seed_studies else None,
         reproductions=reproductions if reproductions else None,
         diagnoses=diagnoses if diagnoses else None,
+        profiles=profiles if profiles else None,
     )

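The new load_profiles() helper globs experiments/profiles/*-profile.yaml, and the brief formatter reads experiment_id, profile.total_time_sec, profile.memory.peak_rss_mb, bottleneck.type, bottleneck.severity, and recommendations from each document. Below is a minimal sketch of a profile file that would satisfy those accessors; the field names come from the diff above, while the concrete values and the "exp-042" ID are invented for illustration.

```python
# Sketch only: writes one profile file in the shape format_brief() expects.
# Values and the experiment ID are placeholders, not package defaults.
import yaml
from pathlib import Path

profile = {
    "experiment_id": "exp-042",            # hypothetical ID
    "profile": {
        "total_time_sec": 512.3,
        "memory": {"peak_rss_mb": 1843},
    },
    "bottleneck": {"type": "data_loading", "severity": "high"},
    "recommendations": ["Cache preprocessed features between epochs."],
}

out = Path("experiments/profiles")
out.mkdir(parents=True, exist_ok=True)
# The filename must match the "*-profile.yaml" glob used by load_profiles().
(out / "exp-042-profile.yaml").write_text(yaml.safe_dump(profile))
```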
package/templates/scripts/latency_benchmark.py (new file)

@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""Inference latency benchmarking for model exports.
+
+Measures p50/p95/p99 inference latency with warm-up phase.
+Compares original vs exported model latency.
+"""
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+
+import numpy as np
+
+
+DEFAULT_WARMUP = 10
+DEFAULT_ITERATIONS = 100
+
+
+def benchmark_inference(
+    predict_fn,
+    test_input,
+    n_warmup: int = DEFAULT_WARMUP,
+    n_iterations: int = DEFAULT_ITERATIONS,
+) -> dict:
+    """Benchmark inference latency of a prediction function.
+
+    Args:
+        predict_fn: Callable that takes input and returns predictions.
+        test_input: Input data for prediction.
+        n_warmup: Number of warm-up calls (discarded).
+        n_iterations: Number of benchmark calls.
+
+    Returns:
+        Dict with p50, p95, p99 latency in milliseconds and raw timings.
+    """
+    # Warm-up phase
+    for _ in range(n_warmup):
+        try:
+            predict_fn(test_input)
+        except Exception:
+            pass
+
+    # Benchmark phase
+    timings_ms = []
+    for _ in range(n_iterations):
+        start = time.perf_counter()
+        try:
+            predict_fn(test_input)
+        except Exception as e:
+            return {"error": f"Prediction failed during benchmark: {e}"}
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        timings_ms.append(elapsed_ms)
+
+    arr = np.array(timings_ms)
+    return {
+        "n_iterations": n_iterations,
+        "n_warmup": n_warmup,
+        "p50_ms": round(float(np.percentile(arr, 50)), 3),
+        "p95_ms": round(float(np.percentile(arr, 95)), 3),
+        "p99_ms": round(float(np.percentile(arr, 99)), 3),
+        "mean_ms": round(float(np.mean(arr)), 3),
+        "std_ms": round(float(np.std(arr)), 3),
+        "min_ms": round(float(np.min(arr)), 3),
+        "max_ms": round(float(np.max(arr)), 3),
+    }
+
+
+def compare_latency(
+    original_benchmark: dict,
+    exported_benchmark: dict,
+) -> dict:
+    """Compare latency between original and exported model.
+
+    Returns comparison dict with speedup ratios and verdict.
+    """
+    if "error" in original_benchmark or "error" in exported_benchmark:
+        return {
+            "verdict": "error",
+            "reason": original_benchmark.get("error") or exported_benchmark.get("error"),
+        }
+
+    orig_p50 = original_benchmark["p50_ms"]
+    exported_p50 = exported_benchmark["p50_ms"]
+
+    if exported_p50 > 0:
+        speedup = orig_p50 / exported_p50
+    else:
+        speedup = float("inf")
+
+    if speedup > 1.1:
+        verdict = "faster"
+        description = f"Exported model is {speedup:.1f}x faster (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+    elif speedup < 0.9:
+        verdict = "slower"
+        description = f"Exported model is {1/speedup:.1f}x slower (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+    else:
+        verdict = "similar"
+        description = f"Similar latency (p50: {orig_p50:.2f}ms vs {exported_p50:.2f}ms)"
+
+    return {
+        "verdict": verdict,
+        "description": description,
+        "speedup_ratio": round(speedup, 2),
+        "original_p50_ms": orig_p50,
+        "exported_p50_ms": exported_p50,
+        "original_p95_ms": original_benchmark["p95_ms"],
+        "exported_p95_ms": exported_benchmark["p95_ms"],
+        "original_p99_ms": original_benchmark["p99_ms"],
+        "exported_p99_ms": exported_benchmark["p99_ms"],
+    }
+
+
+def compute_percentiles(timings_ms: list[float]) -> dict:
+    """Compute percentile statistics from raw timings."""
+    if not timings_ms:
+        return {}
+    arr = np.array(timings_ms)
+    return {
+        "p50_ms": round(float(np.percentile(arr, 50)), 3),
+        "p95_ms": round(float(np.percentile(arr, 95)), 3),
+        "p99_ms": round(float(np.percentile(arr, 99)), 3),
+        "mean_ms": round(float(np.mean(arr)), 3),
+        "std_ms": round(float(np.std(arr)), 3),
+        "min_ms": round(float(np.min(arr)), 3),
+        "max_ms": round(float(np.max(arr)), 3),
+    }
+
+
+def format_benchmark_report(
+    original: dict | None,
+    exported: dict | None,
+    comparison: dict | None = None,
+) -> str:
+    """Format benchmark results as readable text."""
+    lines = ["## Latency Benchmark", ""]
+
+    if exported:
+        lines.extend([
+            "### Exported Model",
+            "",
+            f"- **p50:** {exported['p50_ms']:.2f} ms",
+            f"- **p95:** {exported['p95_ms']:.2f} ms",
+            f"- **p99:** {exported['p99_ms']:.2f} ms",
+            f"- **mean:** {exported['mean_ms']:.2f} ms (std: {exported['std_ms']:.2f})",
+            f"- **iterations:** {exported.get('n_iterations', 'N/A')}",
+        ])
+
+    if original:
+        lines.extend([
+            "",
+            "### Original Model",
+            "",
+            f"- **p50:** {original['p50_ms']:.2f} ms",
+            f"- **p95:** {original['p95_ms']:.2f} ms",
+            f"- **p99:** {original['p99_ms']:.2f} ms",
+        ])
+
+    if comparison and comparison.get("verdict") != "error":
+        lines.extend([
+            "",
+            "### Comparison",
+            "",
+            f"**{comparison['verdict'].upper()}** — {comparison['description']}",
+        ])
+
+    return "\n".join(lines)
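For reference, a rough usage sketch of the new benchmarking helpers follows. It assumes a scikit-learn style model with a predict method; the DummyClassifier here is only a stand-in so the example is self-contained, not something the package ships.

```python
# Sketch: benchmark a model, compare it against a (here identical) "exported"
# model, and print the markdown report. The sklearn model is an assumption.
import numpy as np
from sklearn.dummy import DummyClassifier
from scripts.latency_benchmark import (
    benchmark_inference,
    compare_latency,
    format_benchmark_report,
)

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)
model = DummyClassifier(strategy="most_frequent").fit(X, y)

single_row = X[:1]
orig = benchmark_inference(model.predict, single_row)      # warm-up + 100 timed calls
exported = benchmark_inference(model.predict, single_row)  # stand-in for an exported model
comparison = compare_latency(orig, exported)                # "faster" / "slower" / "similar"

print(format_benchmark_report(orig, exported, comparison))
```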