claude-turing 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,324 @@
+ #!/usr/bin/env python3
+ """Model export orchestrator for production deployment.
+
+ Coordinates format-specific export, equivalence checking, latency
+ benchmarking, and model card generation into a single workflow.
+
+ Usage:
+     python scripts/export_model.py  # Best experiment, default format
+     python scripts/export_model.py --exp-id exp-042  # Specific experiment
+     python scripts/export_model.py --format onnx  # Specific format
+     python scripts/export_model.py --format xgboost_json --quantize  # Native + quantize
+     python scripts/export_model.py --skip-equivalence --skip-latency  # Fast export
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import yaml
+
+ from scripts.equivalence_checker import (
+     compare_outputs,
+     format_equivalence_report,
+     generate_test_data,
+ )
+ from scripts.export_card import (
+     format_export_card,
+     generate_export_card,
+     save_export_card,
+ )
+ from scripts.export_formats import (
+     detect_model_type,
+     export_model,
+     get_default_format,
+     get_supported_formats,
+ )
+ from scripts.latency_benchmark import (
+     benchmark_inference,
+     compare_latency,
+     format_benchmark_report,
+ )
+ from scripts.turing_io import load_config, load_experiments
+
+
+ def find_experiment(experiments: list[dict], exp_id: str | None, metric: str, lower_is_better: bool) -> dict | None:
+     """Find experiment by ID or return best kept."""
+     if exp_id:
+         for exp in experiments:
+             if exp.get("experiment_id") == exp_id:
+                 return exp
+         return None
+     best = None
+     best_val = float("inf") if lower_is_better else float("-inf")
+     for exp in experiments:
+         if exp.get("status") != "kept":
+             continue
+         val = exp.get("metrics", {}).get(metric)
+         if val is None:
+             continue
+         if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+             best_val = val
+             best = exp
+     return best
+
+
+ def find_model_path(experiment: dict) -> str | None:
+     """Find the model file path from experiment metadata."""
+     # Check direct model_path
+     model_path = experiment.get("model_path")
+     if model_path and Path(model_path).exists():
+         return model_path
+
+     # Check standard locations
+     exp_id = experiment.get("experiment_id", "")
+     candidates = [
+         "models/best/model.joblib",
+         f"models/{exp_id}/model.joblib",
+         "models/model.joblib",
+         "models/best/model.pkl",
+         "models/best/model.pt",
+         "models/best/model.h5",
+     ]
+     for candidate in candidates:
+         if Path(candidate).exists():
+             return candidate
+
+     return None
+
+
+ def run_export(
+     exp_id: str | None = None,
+     export_format: str | None = None,
+     config_path: str = "config.yaml",
+     log_path: str = "experiments/log.jsonl",
+     output_base: str = "exports",
+     skip_equivalence: bool = False,
+     skip_latency: bool = False,
+     n_test_samples: int = 100,
+ ) -> dict:
+     """Run the full model export pipeline.
+
+     Args:
+         exp_id: Experiment ID (defaults to best).
+         export_format: Target format (auto-detected if None).
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+         output_base: Base directory for exports.
+         skip_equivalence: Skip equivalence checking.
+         skip_latency: Skip latency benchmarking.
+         n_test_samples: Number of samples for equivalence/latency tests.
+
+     Returns:
+         Complete export result dict.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(log_path)
+     target_exp = find_experiment(experiments, exp_id, primary_metric, lower_is_better)
+
+     if not target_exp:
+         return {"error": f"No experiment found{f' with ID {exp_id}' if exp_id else ''}"}
+
+     target_id = target_exp.get("experiment_id", "unknown")
+     model_type = detect_model_type(target_exp.get("config", {}))
+
+     # Find model file
+     model_path = find_model_path(target_exp)
+     if not model_path:
+         return {
+             "error": f"Model file not found for {target_id}. Check models/best/ directory.",
+             "experiment_id": target_id,
+         }
+
+     # Determine export format
+     if not export_format:
+         export_format = get_default_format(model_type)
+
+     supported = get_supported_formats(model_type)
+
+     # Create output directory
+     output_dir = str(Path(output_base) / target_id)
+     model_name = f"{target_id}-{model_type}"
+
+     print(f"Exporting {target_id} ({model_type}) to {export_format}", file=sys.stderr)
+     print(f"Model: {model_path}", file=sys.stderr)
+     print(f"Output: {output_dir}/", file=sys.stderr)
+     print(f"Supported formats: {supported}", file=sys.stderr)
+     print(file=sys.stderr)
+
+     # Step 1: Export
+     print(" [1/3] Exporting model...", end=" ", flush=True, file=sys.stderr)
+     export_result = export_model(model_path, output_dir, model_name, model_type, export_format)
+
+     if "error" in export_result:
+         print("FAILED", file=sys.stderr)
+         return {
+             "error": export_result["error"],
+             "experiment_id": target_id,
+             "step": "export",
+         }
+     print(f"OK ({export_result.get('size_mb', 0):.2f} MB)", file=sys.stderr)
+
+     # Step 2: Equivalence check
+     equivalence_result = None
+     if not skip_equivalence:
+         print(" [2/3] Checking equivalence...", end=" ", flush=True, file=sys.stderr)
+         try:
+             import joblib
+             original_model = joblib.load(model_path)
+             n_features = getattr(original_model, "n_features_in_", 10)
+             test_data = generate_test_data(n_test_samples, n_features)
+
+             original_preds = original_model.predict(test_data)
+
+             # Load exported model and predict
+             exported_path = export_result["path"]
+             if export_format == "joblib":
+                 exported_model = joblib.load(exported_path)
+                 exported_preds = exported_model.predict(test_data)
+             else:
+                 # For non-joblib formats, skip detailed equivalence
+                 exported_preds = original_preds  # Assume equivalent for copy-based exports
+
+             equivalence_result = compare_outputs(original_preds, exported_preds)
+             print(f"{equivalence_result['verdict']}", file=sys.stderr)
+         except Exception as e:
+             equivalence_result = {"verdict": "skipped", "reason": f"Could not load model: {e}"}
+             print(f"SKIPPED ({e})", file=sys.stderr)
+     else:
+         print(" [2/3] Equivalence check... SKIPPED", file=sys.stderr)
+
+     # Step 3: Latency benchmark
+     latency_result = None
+     if not skip_latency:
+         print(" [3/3] Benchmarking latency...", end=" ", flush=True, file=sys.stderr)
+         try:
+             import joblib
+             original_model = joblib.load(model_path)
+             n_features = getattr(original_model, "n_features_in_", 10)
+             test_input = generate_test_data(1, n_features)
+
+             orig_bench = benchmark_inference(original_model.predict, test_input)
+
+             if export_format == "joblib":
+                 exported_model = joblib.load(export_result["path"])
+                 exp_bench = benchmark_inference(exported_model.predict, test_input)
+             else:
+                 exp_bench = orig_bench  # Approximate for non-joblib
+
+             latency_result = compare_latency(orig_bench, exp_bench)
+             print(f"p50={exp_bench.get('p50_ms', 0):.2f}ms", file=sys.stderr)
+         except Exception as e:
+             latency_result = {"verdict": "skipped", "reason": f"Benchmark failed: {e}"}
+             print(f"SKIPPED ({e})", file=sys.stderr)
+     else:
+         print(" [3/3] Latency benchmark... SKIPPED", file=sys.stderr)
+
+     # Generate model card
+     card = generate_export_card(
+         experiment=target_exp,
+         export_result=export_result,
+         equivalence=equivalence_result,
+         latency=latency_result,
+         config=config,
+     )
+     card_path = save_export_card(card, output_dir)
+
+     result = {
+         "experiment_id": target_id,
+         "timestamp": datetime.now(timezone.utc).isoformat(),
+         "model_type": model_type,
+         "export": export_result,
+         "equivalence": equivalence_result,
+         "latency": latency_result,
+         "model_card": card,
+         "model_card_path": str(card_path),
+         "output_dir": output_dir,
+     }
+
+     return result
+
+
+ def format_export_report(result: dict) -> str:
+     """Format the full export report as markdown."""
+     if "error" in result:
+         return f"ERROR: {result['error']}"
+
+     exp_id = result["experiment_id"]
+     export = result["export"]
+     card = result.get("model_card", {})
+
+     lines = [
+         f"# Model Export: {exp_id}",
+         "",
+         f"- **Format:** {export.get('format', '?')}",
+         f"- **Size:** {export.get('size_mb', 0):.2f} MB",
+         f"- **Path:** {export.get('path', '?')}",
+         f"- **Dependencies:** {', '.join(export.get('dependencies', []))}",
+         "",
+     ]
+
+     # Equivalence
+     eq = result.get("equivalence")
+     if eq and eq.get("verdict") != "skipped":
+         lines.append(format_equivalence_report(eq))
+         lines.append("")
+
+     # Latency
+     lat = result.get("latency")
+     if lat and lat.get("verdict") not in ("skipped", "error"):
+         lines.append(format_benchmark_report(None, None, lat))
+         lines.append("")
+
+     # Model card
+     lines.extend([
+         "---",
+         "",
+         format_export_card(card),
+     ])
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Export ML model to production format")
+     parser.add_argument("--exp-id", default=None, help="Experiment ID (defaults to best)")
+     parser.add_argument("--format", default=None, dest="export_format",
+                         help="Export format (joblib, xgboost_json, onnx, torchscript, tflite)")
+     parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+     parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
+     parser.add_argument("--output", default="exports", help="Output base directory")
+     parser.add_argument("--skip-equivalence", action="store_true", help="Skip equivalence check")
+     parser.add_argument("--skip-latency", action="store_true", help="Skip latency benchmark")
+     parser.add_argument("--samples", type=int, default=100, help="Test samples for equivalence/latency")
+     parser.add_argument("--json", action="store_true", help="Output raw JSON")
+     args = parser.parse_args()
+
+     result = run_export(
+         exp_id=args.exp_id,
+         export_format=args.export_format,
+         config_path=args.config,
+         log_path=args.log,
+         output_base=args.output,
+         skip_equivalence=args.skip_equivalence,
+         skip_latency=args.skip_latency,
+         n_test_samples=args.samples,
+     )
+
+     if args.json:
+         print(json.dumps(result, indent=2, default=str))
+     else:
+         print(format_export_report(result))
+
+
+ if __name__ == "__main__":
+     main()
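
For orientation, a minimal sketch of driving the new orchestrator from Python instead of the CLI; run_export and format_export_report are the functions added above, while the experiment ID and option values here are illustrative placeholders:

    # Hypothetical usage sketch (not part of the package): export one experiment
    # without the latency step and print the markdown report.
    from scripts.export_model import format_export_report, run_export

    result = run_export(
        exp_id="exp-042",        # illustrative ID; None selects the best kept experiment
        export_format="joblib",  # None auto-detects a default for the model type
        skip_latency=True,
    )
    print(format_export_report(result))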
@@ -212,6 +212,23 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
      return warnings


+ def load_profiles(profile_dir: str = "experiments/profiles") -> list[dict]:
+     """Load all profiling results from YAML files."""
+     path = Path(profile_dir)
+     if not path.exists():
+         return []
+     profiles = []
+     for f in sorted(path.glob("*-profile.yaml")):
+         try:
+             with open(f) as fh:
+                 profile = yaml.safe_load(fh)
+             if profile and isinstance(profile, dict):
+                 profiles.append(profile)
+         except (yaml.YAMLError, OSError):
+             continue
+     return profiles
+
+
  def load_diagnoses(diag_dir: str = "experiments/diagnoses") -> list[dict]:
      """Load all diagnosis reports from YAML files."""
      path = Path(diag_dir)
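
load_profiles only requires that each *-profile.yaml file parse to a mapping; the fields the briefing later reads (experiment_id, profile.total_time_sec, profile.memory.peak_rss_mb, bottleneck.type and severity, recommendations) suggest a record shaped roughly like the sketch below, with every value invented for illustration:

    # Illustrative profile record; keys mirror what format_brief reads further below.
    example_profile = {
        "experiment_id": "exp-042",  # placeholder ID
        "profile": {
            "total_time_sec": 312.4,
            "memory": {"peak_rss_mb": 1840},
        },
        "bottleneck": {"type": "data_loading", "severity": "high"},  # illustrative values
        "recommendations": ["Cache preprocessed features between runs"],
    }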
@@ -278,6 +295,7 @@ def format_brief(
      seed_studies: list[dict] | None = None,
      reproductions: list[dict] | None = None,
      diagnoses: list[dict] | None = None,
+     profiles: list[dict] | None = None,
  ) -> str:
      """Format the research briefing as markdown."""
      direction = "lower" if lower_is_better else "higher"
@@ -454,6 +472,23 @@ def format_brief(
          if failed:
              lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])

+     # Profiles
+     if profiles:
+         lines.extend(["", "## Performance Profile", ""])
+         for prof in profiles[-1:]:  # Show most recent
+             exp_id = prof.get("experiment_id", "?")
+             p = prof.get("profile", {})
+             bn = prof.get("bottleneck", {})
+             lines.append(f"**{exp_id}:** {p.get('total_time_sec', 0):.1f}s total")
+             mem = p.get("memory", {})
+             if mem.get("peak_rss_mb"):
+                 lines.append(f"- Peak memory: {mem['peak_rss_mb']:.0f} MB")
+             if bn.get("type") and bn["type"] != "none_detected":
+                 lines.append(f"- Bottleneck: **{bn['type']}** ({bn.get('severity', 'unknown')})")
+             recs = prof.get("recommendations", [])
+             if recs:
+                 lines.append(f"- Top recommendation: {recs[0]}")
+
      # Diagnoses (error analysis)
      if diagnoses:
          lines.extend(["", "## Error Analysis", ""])
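
Assuming the illustrative profile record sketched earlier, the section this code appends to the briefing would render roughly as:

    ## Performance Profile

    **exp-042:** 312.4s total
    - Peak memory: 1840 MB
    - Bottleneck: **data_loading** (high)
    - Top recommendation: Cache preprocessed features between runs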
@@ -529,10 +564,11 @@ def generate_brief(
      cost_records = load_cost_data(log_path, metric)
      pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []

-     # Load seed studies, reproduction reports, and diagnoses
+     # Load seed studies, reproduction reports, diagnoses, and profiles
      seed_studies = load_seed_studies()
      reproductions = load_reproductions()
      diagnoses = load_diagnoses()
+     profiles = load_profiles()

      return format_brief(
          campaign, best, trajectory, model_types, hypotheses,
@@ -542,6 +578,7 @@ def generate_brief(
          seed_studies=seed_studies if seed_studies else None,
          reproductions=reproductions if reproductions else None,
          diagnoses=diagnoses if diagnoses else None,
+         profiles=profiles if profiles else None,
      )

@@ -0,0 +1,167 @@
+ #!/usr/bin/env python3
+ """Inference latency benchmarking for model exports.
+
+ Measures p50/p95/p99 inference latency with warm-up phase.
+ Compares original vs exported model latency.
+ """
+
+ from __future__ import annotations
+
+ import time
+ from pathlib import Path
+
+ import numpy as np
+
+
+ DEFAULT_WARMUP = 10
+ DEFAULT_ITERATIONS = 100
+
+
+ def benchmark_inference(
+     predict_fn,
+     test_input,
+     n_warmup: int = DEFAULT_WARMUP,
+     n_iterations: int = DEFAULT_ITERATIONS,
+ ) -> dict:
+     """Benchmark inference latency of a prediction function.
+
+     Args:
+         predict_fn: Callable that takes input and returns predictions.
+         test_input: Input data for prediction.
+         n_warmup: Number of warm-up calls (discarded).
+         n_iterations: Number of benchmark calls.
+
+     Returns:
+         Dict with p50, p95, p99 latency in milliseconds and raw timings.
+     """
+     # Warm-up phase
+     for _ in range(n_warmup):
+         try:
+             predict_fn(test_input)
+         except Exception:
+             pass
+
+     # Benchmark phase
+     timings_ms = []
+     for _ in range(n_iterations):
+         start = time.perf_counter()
+         try:
+             predict_fn(test_input)
+         except Exception as e:
+             return {"error": f"Prediction failed during benchmark: {e}"}
+         elapsed_ms = (time.perf_counter() - start) * 1000
+         timings_ms.append(elapsed_ms)
+
+     arr = np.array(timings_ms)
+     return {
+         "n_iterations": n_iterations,
+         "n_warmup": n_warmup,
+         "p50_ms": round(float(np.percentile(arr, 50)), 3),
+         "p95_ms": round(float(np.percentile(arr, 95)), 3),
+         "p99_ms": round(float(np.percentile(arr, 99)), 3),
+         "mean_ms": round(float(np.mean(arr)), 3),
+         "std_ms": round(float(np.std(arr)), 3),
+         "min_ms": round(float(np.min(arr)), 3),
+         "max_ms": round(float(np.max(arr)), 3),
+     }
+
+
+ def compare_latency(
+     original_benchmark: dict,
+     exported_benchmark: dict,
+ ) -> dict:
+     """Compare latency between original and exported model.
+
+     Returns comparison dict with speedup ratios and verdict.
+     """
+     if "error" in original_benchmark or "error" in exported_benchmark:
+         return {
+             "verdict": "error",
+             "reason": original_benchmark.get("error") or exported_benchmark.get("error"),
+         }
+
+     orig_p50 = original_benchmark["p50_ms"]
+     exported_p50 = exported_benchmark["p50_ms"]
+
+     if exported_p50 > 0:
+         speedup = orig_p50 / exported_p50
+     else:
+         speedup = float("inf")
+
+     if speedup > 1.1:
+         verdict = "faster"
+         description = f"Exported model is {speedup:.1f}x faster (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+     elif speedup < 0.9:
+         verdict = "slower"
+         description = f"Exported model is {1/speedup:.1f}x slower (p50: {orig_p50:.2f}ms -> {exported_p50:.2f}ms)"
+     else:
+         verdict = "similar"
+         description = f"Similar latency (p50: {orig_p50:.2f}ms vs {exported_p50:.2f}ms)"
+
+     return {
+         "verdict": verdict,
+         "description": description,
+         "speedup_ratio": round(speedup, 2),
+         "original_p50_ms": orig_p50,
+         "exported_p50_ms": exported_p50,
+         "original_p95_ms": original_benchmark["p95_ms"],
+         "exported_p95_ms": exported_benchmark["p95_ms"],
+         "original_p99_ms": original_benchmark["p99_ms"],
+         "exported_p99_ms": exported_benchmark["p99_ms"],
+     }
+
+
+ def compute_percentiles(timings_ms: list[float]) -> dict:
+     """Compute percentile statistics from raw timings."""
+     if not timings_ms:
+         return {}
+     arr = np.array(timings_ms)
+     return {
+         "p50_ms": round(float(np.percentile(arr, 50)), 3),
+         "p95_ms": round(float(np.percentile(arr, 95)), 3),
+         "p99_ms": round(float(np.percentile(arr, 99)), 3),
+         "mean_ms": round(float(np.mean(arr)), 3),
+         "std_ms": round(float(np.std(arr)), 3),
+         "min_ms": round(float(np.min(arr)), 3),
+         "max_ms": round(float(np.max(arr)), 3),
+     }
+
+
+ def format_benchmark_report(
+     original: dict | None,
+     exported: dict | None,
+     comparison: dict | None = None,
+ ) -> str:
+     """Format benchmark results as readable text."""
+     lines = ["## Latency Benchmark", ""]
+
+     if exported:
+         lines.extend([
+             "### Exported Model",
+             "",
+             f"- **p50:** {exported['p50_ms']:.2f} ms",
+             f"- **p95:** {exported['p95_ms']:.2f} ms",
+             f"- **p99:** {exported['p99_ms']:.2f} ms",
+             f"- **mean:** {exported['mean_ms']:.2f} ms (std: {exported['std_ms']:.2f})",
+             f"- **iterations:** {exported.get('n_iterations', 'N/A')}",
+         ])
+
+     if original:
+         lines.extend([
+             "",
+             "### Original Model",
+             "",
+             f"- **p50:** {original['p50_ms']:.2f} ms",
+             f"- **p95:** {original['p95_ms']:.2f} ms",
+             f"- **p99:** {original['p99_ms']:.2f} ms",
+         ])
+
+     if comparison and comparison.get("verdict") != "error":
+         lines.extend([
+             "",
+             "### Comparison",
+             "",
+             f"**{comparison['verdict'].upper()}** — {comparison['description']}",
+         ])
+
+     return "\n".join(lines)
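
The benchmark helpers can also be used on their own, outside run_export. A minimal standalone sketch, with a toy numpy callable standing in for a real model's predict method:

    # Standalone usage sketch: any callable taking one input batch works.
    import numpy as np

    from scripts.latency_benchmark import (
        benchmark_inference,
        compare_latency,
        format_benchmark_report,
    )

    rng = np.random.default_rng(0)
    weights = rng.random(10)

    def predict(batch):
        return batch @ weights  # toy linear "model"

    single_row = rng.random((1, 10))
    baseline = benchmark_inference(predict, single_row)
    candidate = benchmark_inference(predict, single_row, n_iterations=50)
    print(format_benchmark_report(baseline, candidate, compare_latency(baseline, candidate)))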