claude-turing 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""Inference latency benchmarking for model exports.
+
+Measures p50/p95/p99 inference latency with warm-up phase.
+Compares original vs exported model latency.
+"""
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+
+import numpy as np
+
+
+DEFAULT_WARMUP = 10
+DEFAULT_ITERATIONS = 100
+
+
+def benchmark_inference(
+    predict_fn,
+    test_input,
+    n_warmup: int = DEFAULT_WARMUP,
+    n_iterations: int = DEFAULT_ITERATIONS,
+) -> dict:
+    """Benchmark inference latency of a prediction function.
+
+    Args:
+        predict_fn: Callable that takes input and returns predictions.
+        test_input: Input data for prediction.
+        n_warmup: Number of warm-up calls (discarded).
+        n_iterations: Number of benchmark calls.
+
+    Returns:
+        Dict with p50/p95/p99, mean, std, min, and max latency in milliseconds.
+    """
+    # Warm-up phase
+    for _ in range(n_warmup):
+        try:
+            predict_fn(test_input)
+        except Exception:
+            pass
+
+    # Benchmark phase
+    timings_ms = []
+    for _ in range(n_iterations):
+        start = time.perf_counter()
+        try:
+            predict_fn(test_input)
+        except Exception as e:
+            return {"error": f"Prediction failed during benchmark: {e}"}
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        timings_ms.append(elapsed_ms)
+
+    arr = np.array(timings_ms)
+    return {
+        "n_iterations": n_iterations,
+        "n_warmup": n_warmup,
+        "p50_ms": round(float(np.percentile(arr, 50)), 3),
+        "p95_ms": round(float(np.percentile(arr, 95)), 3),
+        "p99_ms": round(float(np.percentile(arr, 99)), 3),
+        "mean_ms": round(float(np.mean(arr)), 3),
+        "std_ms": round(float(np.std(arr)), 3),
+        "min_ms": round(float(np.min(arr)), 3),
+        "max_ms": round(float(np.max(arr)), 3),
+    }
+
+
+def compare_latency(
+    original_benchmark: dict,
+    exported_benchmark: dict,
+) -> dict:
+    """Compare latency between original and exported model.
+
+    Returns comparison dict with speedup ratios and verdict.
+    """
+    if "error" in original_benchmark or "error" in exported_benchmark:
+        return {
+            "verdict": "error",
+            "reason": original_benchmark.get("error") or exported_benchmark.get("error"),
+        }
+
+    orig_p50 = original_benchmark["p50_ms"]
+    exported_p50 = exported_benchmark["p50_ms"]
+
+    if exported_p50 > 0:
+        speedup = orig_p50 / exported_p50
+    else:
+        speedup = float("inf")
+
+    if speedup > 1.1:
+        verdict = "faster"
+        description = f"Exported model is {speedup:.1f}x faster (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+    elif speedup < 0.9:
+        verdict = "slower"
+        description = f"Exported model is {1/speedup:.1f}x slower (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+    else:
+        verdict = "similar"
+        description = f"Similar latency (p50: {orig_p50:.2f}ms vs {exported_p50:.2f}ms)"
+
+    return {
+        "verdict": verdict,
+        "description": description,
+        "speedup_ratio": round(speedup, 2),
+        "original_p50_ms": orig_p50,
+        "exported_p50_ms": exported_p50,
+        "original_p95_ms": original_benchmark["p95_ms"],
+        "exported_p95_ms": exported_benchmark["p95_ms"],
+        "original_p99_ms": original_benchmark["p99_ms"],
+        "exported_p99_ms": exported_benchmark["p99_ms"],
+    }
+
+
+def compute_percentiles(timings_ms: list[float]) -> dict:
+    """Compute percentile statistics from raw timings."""
+    if not timings_ms:
+        return {}
+    arr = np.array(timings_ms)
+    return {
+        "p50_ms": round(float(np.percentile(arr, 50)), 3),
+        "p95_ms": round(float(np.percentile(arr, 95)), 3),
+        "p99_ms": round(float(np.percentile(arr, 99)), 3),
+        "mean_ms": round(float(np.mean(arr)), 3),
+        "std_ms": round(float(np.std(arr)), 3),
+        "min_ms": round(float(np.min(arr)), 3),
+        "max_ms": round(float(np.max(arr)), 3),
+    }
+
+
+def format_benchmark_report(
+    original: dict | None,
+    exported: dict | None,
+    comparison: dict | None = None,
+) -> str:
+    """Format benchmark results as readable text."""
+    lines = ["## Latency Benchmark", ""]
+
+    if exported:
+        lines.extend([
+            "### Exported Model",
+            "",
+            f"- **p50:** {exported['p50_ms']:.2f} ms",
+            f"- **p95:** {exported['p95_ms']:.2f} ms",
+            f"- **p99:** {exported['p99_ms']:.2f} ms",
+            f"- **mean:** {exported['mean_ms']:.2f} ms (std: {exported['std_ms']:.2f})",
+            f"- **iterations:** {exported.get('n_iterations', 'N/A')}",
+        ])
+
+    if original:
+        lines.extend([
+            "",
+            "### Original Model",
+            "",
+            f"- **p50:** {original['p50_ms']:.2f} ms",
+            f"- **p95:** {original['p95_ms']:.2f} ms",
+            f"- **p99:** {original['p99_ms']:.2f} ms",
+        ])
+
+    if comparison and comparison.get("verdict") != "error":
+        lines.extend([
+            "",
+            "### Comparison",
+            "",
+            f"**{comparison['verdict'].upper()}** — {comparison['description']}",
+        ])
+
+    return "\n".join(lines)
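For context, here is a minimal sketch of how the helpers added above might be wired together. The stand-in predict functions and the flat `latency_benchmark` import path are assumptions for illustration (the file is scaffolded as a project template); only `benchmark_inference`, `compare_latency`, and `format_benchmark_report` come from the diff itself.

```python
# Illustrative sketch only: the numpy stand-in "models" and the import path
# are assumptions; the three helpers are the functions added in the new
# latency_benchmark.py template above.
import numpy as np

from latency_benchmark import (
    benchmark_inference,
    compare_latency,
    format_benchmark_report,
)

# Stand-in prediction functions; a real run would wrap the original model
# and its exported counterpart.
def original_predict(x):
    return x @ np.random.rand(x.shape[1], 8)

def exported_predict(x):
    return x @ np.random.rand(x.shape[1], 8)

test_input = np.random.rand(32, 128)

orig_stats = benchmark_inference(original_predict, test_input)
export_stats = benchmark_inference(exported_predict, test_input)
comparison = compare_latency(orig_stats, export_stats)

print(format_benchmark_report(orig_stats, export_stats, comparison))
```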
@@ -97,6 +97,11 @@ TEMPLATE_DIRS = {
         "pareto_frontier.py",
         "profile_training.py",
         "checkpoint_manager.py",
+        "export_model.py",
+        "export_formats.py",
+        "equivalence_checker.py",
+        "latency_benchmark.py",
+        "export_card.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }
@@ -112,6 +117,7 @@ DIRECTORIES_TO_CREATE = [
     "experiments/predictions",
     "experiments/profiles",
     "experiments/checkpoints",
+    "exports",
     "models/best",
     "models/archive",
 ]