claude-turing 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +3 -2
- package/commands/export.md +48 -0
- package/commands/turing.md +2 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/equivalence_checker.py +158 -0
- package/templates/scripts/export_card.py +183 -0
- package/templates/scripts/export_formats.py +385 -0
- package/templates/scripts/export_model.py +324 -0
- package/templates/scripts/latency_benchmark.py +167 -0
- package/templates/scripts/scaffold.py +6 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 25 commands, 2 specialized agents, production model export (6 formats, equivalence verification, latency benchmarking), performance profiling, smart Pareto-based checkpoint management, experiment intelligence (error analysis, ablation studies, Pareto frontiers), statistical rigor (multi-seed studies, reproducibility verification), tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -330,6 +330,7 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
330
330
|
| `/turing:frontier [--metrics]` | Pareto frontier — multi-objective tradeoff visualization |
|
|
331
331
|
| `/turing:profile [exp-id]` | Computational profiling — timing, memory, throughput, bottleneck detection |
|
|
332
332
|
| `/turing:checkpoint <action>` | Smart checkpoint management — list, prune (Pareto), average, resume, stats |
|
|
333
|
+
| `/turing:export [--format]` | Export model to production format with equivalence check + latency benchmark |
|
|
333
334
|
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
334
335
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
335
336
|
| `/turing:report` | Generate research report |
|
|
@@ -519,11 +520,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
519
520
|
|
|
520
521
|
## Architecture of Turing Itself
|
|
521
522
|
|
|
522
|
-
|
|
523
|
+
25 commands, 2 agents, 8 config files, 44 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, 611 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
523
524
|
|
|
524
525
|
```
|
|
525
526
|
turing/
|
|
526
|
-
├── commands/
|
|
527
|
+
├── commands/ 24 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment)
|
|
527
528
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
528
529
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
529
530
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: export
|
|
3
|
+
description: Export model to production format with equivalence verification, latency benchmarking, and deployment model card.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--format joblib|xgboost_json|onnx|torchscript|tflite]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Export a trained model to a production-ready format.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to best
|
|
20
|
+
- `--format joblib|xgboost_json|onnx|torchscript|tflite` specifies export format (auto-detected if omitted)
|
|
21
|
+
- `--skip-equivalence` skips inference equivalence check
|
|
22
|
+
- `--skip-latency` skips latency benchmark
|
|
23
|
+
- `--samples 100` sets test sample count
|
|
24
|
+
|
|
25
|
+
3. **Run export pipeline:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/export_model.py $ARGUMENTS
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **Report results:**
|
|
31
|
+
- **Export:** format, file size, output path, dependencies
|
|
32
|
+
- **Equivalence:** verdict (equivalent/approximately_equivalent/divergent), max delta
|
|
33
|
+
- **Latency:** p50/p95/p99 ms, speedup vs original
|
|
34
|
+
- **Model Card:** metrics, seed study, equivalence, latency, dependencies
|
|
35
|
+
|
|
36
|
+
5. **Output:** exported model + model_card.yaml written to `exports/exp-NNN/`
|
|
37
|
+
|
|
38
|
+
6. **If model file not found:** suggest checking models/best/ directory.
|
|
39
|
+
|
|
40
|
+
## Examples
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
/turing:export # Best experiment, default format
|
|
44
|
+
/turing:export exp-042 # Specific experiment
|
|
45
|
+
/turing:export --format xgboost_json # Native XGBoost JSON
|
|
46
|
+
/turing:export --format onnx # ONNX format
|
|
47
|
+
/turing:export --skip-equivalence --skip-latency # Fast export
|
|
48
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -31,6 +31,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
31
31
|
| "diagnose", "error analysis", "failure modes", "where does it fail", "confusion matrix" | `/turing:diagnose` | Analyze |
|
|
32
32
|
| "ablate", "ablation", "remove component", "which features matter", "component impact" | `/turing:ablate` | Analyze |
|
|
33
33
|
| "frontier", "pareto", "tradeoff", "tradeoffs", "multi-objective", "which model is best" | `/turing:frontier` | Analyze |
|
|
34
|
+
| "export", "deploy", "production", "onnx", "torchscript", "tflite", "ship model" | `/turing:export` | Deploy |
|
|
34
35
|
| "profile", "profiling", "bottleneck", "slow training", "why is it slow", "timing" | `/turing:profile` | Check |
|
|
35
36
|
| "checkpoint", "checkpoints", "prune checkpoints", "disk space", "resume training" | `/turing:checkpoint` | Check |
|
|
36
37
|
|
|
@@ -60,6 +61,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
60
61
|
| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | (inline) |
|
|
61
62
|
| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | (inline) |
|
|
62
63
|
| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | (inline) |
|
|
64
|
+
| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark | (inline) |
|
|
63
65
|
| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
|
|
64
66
|
| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
|
|
65
67
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -23,7 +23,7 @@ const SUB_COMMANDS = [
|
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
24
|
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
25
|
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
|
-
"diagnose", "ablate", "frontier", "profile", "checkpoint",
|
|
26
|
+
"diagnose", "ablate", "frontier", "profile", "checkpoint", "export",
|
|
27
27
|
];
|
|
28
28
|
|
|
29
29
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Inference equivalence verification for model exports.
|
|
3
|
+
|
|
4
|
+
Compares outputs between the original model and the exported model
|
|
5
|
+
to verify they produce identical (or near-identical) results.
|
|
6
|
+
|
|
7
|
+
Verdicts:
|
|
8
|
+
equivalent — max delta < 1e-6 (float precision)
|
|
9
|
+
approximately_equivalent — max delta < tolerance (default 1e-5)
|
|
10
|
+
divergent — max delta >= tolerance
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
FLOAT32_TOLERANCE = 1e-5
|
|
23
|
+
QUANTIZED_TOLERANCE = 1e-3
|
|
24
|
+
EXACT_TOLERANCE = 1e-6
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def compare_outputs(
|
|
28
|
+
original_outputs: list | np.ndarray,
|
|
29
|
+
exported_outputs: list | np.ndarray,
|
|
30
|
+
tolerance: float = FLOAT32_TOLERANCE,
|
|
31
|
+
) -> dict:
|
|
32
|
+
"""Compare outputs from original and exported models.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
original_outputs: Predictions from original model.
|
|
36
|
+
exported_outputs: Predictions from exported model.
|
|
37
|
+
tolerance: Maximum allowed difference.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
Dict with verdict, max_delta, mean_delta, and per-sample details.
|
|
41
|
+
"""
|
|
42
|
+
orig = np.array(original_outputs, dtype=np.float64).flatten()
|
|
43
|
+
exported = np.array(exported_outputs, dtype=np.float64).flatten()
|
|
44
|
+
|
|
45
|
+
if orig.shape != exported.shape:
|
|
46
|
+
return {
|
|
47
|
+
"verdict": "divergent",
|
|
48
|
+
"reason": f"Shape mismatch: original {orig.shape} vs exported {exported.shape}",
|
|
49
|
+
"max_delta": float("inf"),
|
|
50
|
+
"mean_delta": float("inf"),
|
|
51
|
+
"n_samples": 0,
|
|
52
|
+
"n_divergent": 0,
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
deltas = np.abs(orig - exported)
|
|
56
|
+
max_delta = float(np.max(deltas))
|
|
57
|
+
mean_delta = float(np.mean(deltas))
|
|
58
|
+
n_divergent = int(np.sum(deltas >= tolerance))
|
|
59
|
+
|
|
60
|
+
if max_delta < EXACT_TOLERANCE:
|
|
61
|
+
verdict = "equivalent"
|
|
62
|
+
reason = f"Exact match (max delta {max_delta:.2e} < {EXACT_TOLERANCE})"
|
|
63
|
+
elif max_delta < tolerance:
|
|
64
|
+
verdict = "approximately_equivalent"
|
|
65
|
+
reason = f"Within tolerance (max delta {max_delta:.2e} < {tolerance})"
|
|
66
|
+
else:
|
|
67
|
+
verdict = "divergent"
|
|
68
|
+
reason = (
|
|
69
|
+
f"Max delta {max_delta:.2e} exceeds tolerance {tolerance}. "
|
|
70
|
+
f"{n_divergent} of {len(orig)} samples diverge."
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
return {
|
|
74
|
+
"verdict": verdict,
|
|
75
|
+
"reason": reason,
|
|
76
|
+
"max_delta": round(max_delta, 10),
|
|
77
|
+
"mean_delta": round(mean_delta, 10),
|
|
78
|
+
"n_samples": len(orig),
|
|
79
|
+
"n_divergent": n_divergent,
|
|
80
|
+
"tolerance": tolerance,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def run_equivalence_check(
    original_predict_fn,
    exported_predict_fn,
    test_data: np.ndarray | list,
    tolerance: float = FLOAT32_TOLERANCE,
) -> dict:
    """Run equivalence check by predicting on test data with both models.

    Args:
        original_predict_fn: Callable that takes input data and returns predictions.
        exported_predict_fn: Callable for the exported model.
        test_data: Input data to predict on.
        tolerance: Maximum allowed difference.

    Returns:
        Equivalence check result dict.
    """
    # Run both predictors; a failure in either short-circuits with an
    # "error" verdict naming the model that failed.
    predictions = []
    for label, predict in (("Original", original_predict_fn), ("Exported", exported_predict_fn)):
        try:
            predictions.append(predict(test_data))
        except Exception as e:
            return {"verdict": "error", "reason": f"{label} model prediction failed: {e}"}

    return compare_outputs(predictions[0], predictions[1], tolerance)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def generate_test_data(n_samples: int = 100, n_features: int = 10, seed: int = 42) -> np.ndarray:
    """Produce a reproducible random float32 matrix for equivalence checking.

    Args:
        n_samples: Number of test samples.
        n_features: Number of features per sample.
        seed: Random seed for reproducibility.

    Returns:
        Array of shape (n_samples, n_features).
    """
    # The legacy RandomState API is kept deliberately: the same seed must
    # keep producing byte-identical test data across runs.
    samples = np.random.RandomState(seed).randn(n_samples, n_features)
    return samples.astype(np.float32)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def format_equivalence_report(result: dict) -> str:
    """Render an equivalence-check result dict as readable markdown text."""
    markers = {
        "equivalent": "PASS (exact)",
        "approximately_equivalent": "PASS (approx)",
        "divergent": "FAIL",
        "error": "ERROR",
    }
    verdict = result.get("verdict", "unknown")
    # Unknown verdicts fall through unchanged so nothing is silently hidden.
    header = markers.get(verdict, verdict)

    out = [
        f"## Equivalence Check: {header}",
        "",
        f"*{result.get('reason', '')}*",
        "",
    ]

    # Detail lines only make sense when at least one sample was compared.
    if result.get("n_samples"):
        out += [
            f"- **Samples tested:** {result['n_samples']}",
            f"- **Max delta:** {result['max_delta']:.2e}",
            f"- **Mean delta:** {result['mean_delta']:.2e}",
            f"- **Divergent samples:** {result['n_divergent']}",
            f"- **Tolerance:** {result['tolerance']:.0e}",
        ]

    return "\n".join(out)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Deployment model card generation for exported models.
|
|
3
|
+
|
|
4
|
+
Produces a structured model card with metrics, seed study results,
|
|
5
|
+
export format, equivalence check, latency benchmarks, and dependencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from scripts.turing_io import load_config, load_seed_study
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_export_card(
    experiment: dict,
    export_result: dict,
    equivalence: dict | None = None,
    latency: dict | None = None,
    config: dict | None = None,
) -> dict:
    """Generate a deployment model card for an exported model.

    Args:
        experiment: Original experiment dict from log.jsonl.
        export_result: Result from export_formats.export_model().
        equivalence: Result from equivalence_checker.compare_outputs().
        latency: Latency comparison from latency_benchmark.compare_latency().
        config: Project config dict.

    Returns:
        Model card dict.
    """
    cfg = config or {}
    exp_id = experiment.get("experiment_id", "unknown")
    exp_metrics = experiment.get("metrics", {})
    exp_config = experiment.get("config", {})

    # Model type: experiment config wins, then project config, then "unknown".
    fallback_type = cfg.get("model", {}).get("type", "unknown")
    model_type = exp_config.get("model_type", fallback_type)

    eval_cfg = cfg.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    # NOTE(review): when task_description is absent, the *metric name* is used
    # as the task description — preserved as-is, but confirm this is intended.
    task_desc = cfg.get("task_description", eval_cfg.get("primary_metric", "N/A"))

    card = {
        "name": f"{exp_id}-{model_type}",
        "experiment_id": exp_id,
        "task": task_desc,
        "model_type": model_type,
        "primary_metric": primary_metric,
        "metrics": {key: round(val, 4) if isinstance(val, float) else val for key, val in exp_metrics.items()},
        "export_format": export_result.get("format", "unknown"),
        "export_path": export_result.get("path"),
        "size_mb": export_result.get("size_mb", 0),
        "dependencies": export_result.get("dependencies", []),
        "training_date": experiment.get("timestamp", "unknown"),
        "export_date": datetime.now(timezone.utc).isoformat(),
    }

    # Optional sections are attached only when their inputs exist, keeping
    # the serialized card compact.
    seed_study = load_seed_study(exp_id)
    if seed_study and "mean" in seed_study:
        card["seed_study"] = {
            "mean": seed_study["mean"],
            "std": seed_study.get("std", 0),
            "cv_percent": seed_study.get("cv_percent", 0),
            "seed_sensitive": seed_study.get("seed_sensitive", False),
            "seeds_tested": len(seed_study.get("seeds_run", [])),
        }

    if equivalence:
        card["equivalence"] = {
            "verdict": equivalence.get("verdict", "unknown"),
            "max_delta": equivalence.get("max_delta", 0),
            "n_samples_tested": equivalence.get("n_samples", 0),
        }

    if latency and latency.get("verdict") != "error":
        card["inference_latency"] = {
            "exported_p50_ms": latency.get("exported_p50_ms"),
            "exported_p95_ms": latency.get("exported_p95_ms"),
            "original_p50_ms": latency.get("original_p50_ms"),
            "speedup": latency.get("speedup_ratio"),
        }

    env = experiment.get("environment")
    if env:
        card["training_environment"] = {
            "python_version": env.get("python_version"),
            "gpu": env.get("gpu_name") or env.get("gpu"),
        }

    return card
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def save_export_card(card: dict, output_dir: str) -> Path:
    """Write the card to ``<output_dir>/model_card.yaml`` and return its path."""
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    card_path = target_dir / "model_card.yaml"
    # sort_keys=False preserves the card's insertion order in the YAML file.
    with card_path.open("w") as handle:
        yaml.dump(card, handle, default_flow_style=False, sort_keys=False)
    return card_path
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def format_export_card(card: dict) -> str:
    """Format export model card as readable markdown.

    Args:
        card: Model card dict as produced by generate_export_card().

    Returns:
        Markdown string summarizing the exported model.
    """
    lines = [
        f"# Export Model Card: {card.get('name', 'unknown')}",
        "",
        f"- **Experiment:** {card.get('experiment_id', '?')}",
        f"- **Task:** {card.get('task', 'N/A')}",
        f"- **Model type:** {card.get('model_type', '?')}",
        f"- **Export format:** {card.get('export_format', '?')}",
        # `or 0` guards against an explicit None value, which .get(key, 0)
        # does not — the default only applies when the key is missing.
        f"- **Size:** {card.get('size_mb') or 0:.2f} MB",
        f"- **Dependencies:** {', '.join(card.get('dependencies') or [])}",
        "",
        "## Metrics",
        "",
    ]

    for metric, value in card.get("metrics", {}).items():
        if isinstance(value, float):
            lines.append(f"- **{metric}:** {value:.4f}")
        else:
            lines.append(f"- **{metric}:** {value}")

    # Seed study
    seed = card.get("seed_study")
    if seed:
        status = "SEED-SENSITIVE" if seed.get("seed_sensitive") else "STABLE"
        lines.extend([
            "",
            "## Seed Study",
            "",
            f"- **Status:** {status}",
            f"- **Mean ± Std:** {seed.get('mean', 0):.4f} ± {seed.get('std', 0):.4f}",
            f"- **CV:** {seed.get('cv_percent', 0):.2f}%",
            f"- **Seeds tested:** {seed.get('seeds_tested', 0)}",
        ])

    # Equivalence
    eq = card.get("equivalence")
    if eq:
        verdict_markers = {
            "equivalent": "PASS (exact)",
            "approximately_equivalent": "PASS (approx)",
            "divergent": "FAIL",
        }
        verdict = eq.get("verdict", "unknown")
        marker = verdict_markers.get(verdict, verdict)
        lines.extend([
            "",
            "## Equivalence",
            "",
            f"- **Verdict:** {marker}",
            f"- **Max delta:** {eq.get('max_delta') or 0:.2e}",
            f"- **Samples tested:** {eq.get('n_samples_tested', 0)}",
        ])

    # Latency — generate_export_card stores latency.get(...) verbatim, so
    # these keys may be present with value None; coerce before formatting.
    lat = card.get("inference_latency")
    if lat:
        lines.extend([
            "",
            "## Inference Latency",
            "",
            f"- **Exported p50:** {lat.get('exported_p50_ms') or 0:.2f} ms",
            f"- **Exported p95:** {lat.get('exported_p95_ms') or 0:.2f} ms",
        ])
        if lat.get("original_p50_ms"):
            lines.append(f"- **Original p50:** {lat['original_p50_ms']:.2f} ms")
        if lat.get("speedup"):
            lines.append(f"- **Speedup:** {lat['speedup']:.1f}x")

    lines.extend([
        "",
        f"*Exported: {card.get('export_date', 'unknown')}*",
    ])

    return "\n".join(lines)
|