claude-turing 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +3 -2
- package/commands/export.md +48 -0
- package/commands/turing.md +2 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/equivalence_checker.py +158 -0
- package/templates/scripts/export_card.py +183 -0
- package/templates/scripts/export_formats.py +385 -0
- package/templates/scripts/export_model.py +324 -0
- package/templates/scripts/latency_benchmark.py +167 -0
- package/templates/scripts/scaffold.py +6 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Inference latency benchmarking for model exports.
|
|
3
|
+
|
|
4
|
+
Measures p50/p95/p99 inference latency with warm-up phase.
|
|
5
|
+
Compares original vs exported model latency.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
DEFAULT_WARMUP = 10
|
|
17
|
+
DEFAULT_ITERATIONS = 100
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def benchmark_inference(
    predict_fn,
    test_input,
    n_warmup: int = DEFAULT_WARMUP,
    n_iterations: int = DEFAULT_ITERATIONS,
) -> dict:
    """Benchmark inference latency of a prediction function.

    Args:
        predict_fn: Callable that takes input and returns predictions.
        test_input: Input data for prediction.
        n_warmup: Number of warm-up calls (discarded).
        n_iterations: Number of benchmark calls.

    Returns:
        Dict with p50/p95/p99/mean/std/min/max latency in milliseconds and
        the raw per-call timings under "timings_ms", or {"error": ...} if
        prediction fails during the timed phase.
    """
    # Warm-up phase: best-effort on purpose — a transient failure here must
    # not abort the run; the timed phase below reports errors explicitly.
    for _ in range(n_warmup):
        try:
            predict_fn(test_input)
        except Exception:
            pass

    # Benchmark phase: time each call individually with perf_counter.
    timings_ms = []
    for _ in range(n_iterations):
        start = time.perf_counter()
        try:
            predict_fn(test_input)
        except Exception as e:
            return {"error": f"Prediction failed during benchmark: {e}"}
        elapsed_ms = (time.perf_counter() - start) * 1000
        timings_ms.append(elapsed_ms)

    # Guard the empty-sample case (n_iterations <= 0): np.percentile would
    # otherwise raise on an empty array.
    if not timings_ms:
        return {"error": "No timings collected (n_iterations must be > 0)"}

    arr = np.array(timings_ms)
    return {
        "n_iterations": n_iterations,
        "n_warmup": n_warmup,
        "p50_ms": round(float(np.percentile(arr, 50)), 3),
        "p95_ms": round(float(np.percentile(arr, 95)), 3),
        "p99_ms": round(float(np.percentile(arr, 99)), 3),
        "mean_ms": round(float(np.mean(arr)), 3),
        "std_ms": round(float(np.std(arr)), 3),
        "min_ms": round(float(np.min(arr)), 3),
        "max_ms": round(float(np.max(arr)), 3),
        # Raw timings — promised by the docstring but previously omitted.
        "timings_ms": timings_ms,
    }
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def compare_latency(
    original_benchmark: dict,
    exported_benchmark: dict,
) -> dict:
    """Compare latency between original and exported model.

    Args:
        original_benchmark: Stats dict from benchmark_inference for the
            original model (must contain p50_ms/p95_ms/p99_ms keys).
        exported_benchmark: Same-shaped stats dict for the exported model.

    Returns:
        Comparison dict with speedup ratio and verdict, or an "error"
        verdict when either input benchmark failed.
    """
    if "error" in original_benchmark or "error" in exported_benchmark:
        return {
            "verdict": "error",
            "reason": original_benchmark.get("error") or exported_benchmark.get("error"),
        }

    orig_p50 = original_benchmark["p50_ms"]
    exported_p50 = exported_benchmark["p50_ms"]

    if exported_p50 > 0:
        speedup = orig_p50 / exported_p50
    else:
        # Exported p50 rounded down to 0 ms: treat as infinitely faster.
        speedup = float("inf")

    if speedup > 1.1:
        verdict = "faster"
        description = f"Exported model is {speedup:.1f}x faster (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
    elif speedup < 0.9:
        verdict = "slower"
        # Compute the slowdown directly from the p50 values instead of
        # 1/speedup, which raised ZeroDivisionError when orig_p50 == 0
        # (speedup == 0.0 lands in this branch).
        slowdown = exported_p50 / orig_p50 if orig_p50 > 0 else float("inf")
        description = f"Exported model is {slowdown:.1f}x slower (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
    else:
        verdict = "similar"
        description = f"Similar latency (p50: {orig_p50:.2f}ms vs {exported_p50:.2f}ms)"

    return {
        "verdict": verdict,
        "description": description,
        "speedup_ratio": round(speedup, 2),
        "original_p50_ms": orig_p50,
        "exported_p50_ms": exported_p50,
        "original_p95_ms": original_benchmark["p95_ms"],
        "exported_p95_ms": exported_benchmark["p95_ms"],
        "original_p99_ms": original_benchmark["p99_ms"],
        "exported_p99_ms": exported_benchmark["p99_ms"],
    }
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def compute_percentiles(timings_ms: list[float]) -> dict:
    """Compute percentile statistics from raw timings."""
    # An empty sample has no meaningful statistics.
    if not timings_ms:
        return {}

    samples = np.array(timings_ms)
    raw_stats = {
        "p50_ms": np.percentile(samples, 50),
        "p95_ms": np.percentile(samples, 95),
        "p99_ms": np.percentile(samples, 99),
        "mean_ms": np.mean(samples),
        "std_ms": np.std(samples),
        "min_ms": np.min(samples),
        "max_ms": np.max(samples),
    }
    # Convert numpy scalars to plain floats and round uniformly.
    return {name: round(float(value), 3) for name, value in raw_stats.items()}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def format_benchmark_report(
|
|
131
|
+
original: dict | None,
|
|
132
|
+
exported: dict | None,
|
|
133
|
+
comparison: dict | None = None,
|
|
134
|
+
) -> str:
|
|
135
|
+
"""Format benchmark results as readable text."""
|
|
136
|
+
lines = ["## Latency Benchmark", ""]
|
|
137
|
+
|
|
138
|
+
if exported:
|
|
139
|
+
lines.extend([
|
|
140
|
+
"### Exported Model",
|
|
141
|
+
"",
|
|
142
|
+
f"- **p50:** {exported['p50_ms']:.2f} ms",
|
|
143
|
+
f"- **p95:** {exported['p95_ms']:.2f} ms",
|
|
144
|
+
f"- **p99:** {exported['p99_ms']:.2f} ms",
|
|
145
|
+
f"- **mean:** {exported['mean_ms']:.2f} ms (std: {exported['std_ms']:.2f})",
|
|
146
|
+
f"- **iterations:** {exported.get('n_iterations', 'N/A')}",
|
|
147
|
+
])
|
|
148
|
+
|
|
149
|
+
if original:
|
|
150
|
+
lines.extend([
|
|
151
|
+
"",
|
|
152
|
+
"### Original Model",
|
|
153
|
+
"",
|
|
154
|
+
f"- **p50:** {original['p50_ms']:.2f} ms",
|
|
155
|
+
f"- **p95:** {original['p95_ms']:.2f} ms",
|
|
156
|
+
f"- **p99:** {original['p99_ms']:.2f} ms",
|
|
157
|
+
])
|
|
158
|
+
|
|
159
|
+
if comparison and comparison.get("verdict") != "error":
|
|
160
|
+
lines.extend([
|
|
161
|
+
"",
|
|
162
|
+
"### Comparison",
|
|
163
|
+
"",
|
|
164
|
+
f"**{comparison['verdict'].upper()}** — {comparison['description']}",
|
|
165
|
+
])
|
|
166
|
+
|
|
167
|
+
return "\n".join(lines)
|
|
@@ -97,6 +97,11 @@ TEMPLATE_DIRS = {
|
|
|
97
97
|
"pareto_frontier.py",
|
|
98
98
|
"profile_training.py",
|
|
99
99
|
"checkpoint_manager.py",
|
|
100
|
+
"export_model.py",
|
|
101
|
+
"export_formats.py",
|
|
102
|
+
"equivalence_checker.py",
|
|
103
|
+
"latency_benchmark.py",
|
|
104
|
+
"export_card.py",
|
|
100
105
|
],
|
|
101
106
|
"tests": ["__init__.py", "conftest.py"],
|
|
102
107
|
}
|
|
@@ -112,6 +117,7 @@ DIRECTORIES_TO_CREATE = [
|
|
|
112
117
|
"experiments/predictions",
|
|
113
118
|
"experiments/profiles",
|
|
114
119
|
"experiments/checkpoints",
|
|
120
|
+
"exports",
|
|
115
121
|
"models/best",
|
|
116
122
|
"models/archive",
|
|
117
123
|
]
|