crucible-eval 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crucible/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Crucible - what survives quantization, abliteration, and serving.
2
+
3
+ A forensic eval workbench for self-hostable models served through local runtimes.
4
+ """
5
+
6
+ __version__ = "0.0.1"
crucible/charts.py ADDED
@@ -0,0 +1,363 @@
1
+ """Charts - render the findings from results.db as PNGs.
2
+
3
+ The table is the product; these are the table's visual form. Six charts, each answering
4
+ one question someone running local models actually asks:
5
+
6
+ quant_curve - where does quality fall off as you quantize? (pass-rate vs quant)
7
+ toolcall_curve - does tool calling survive quantization? (per toolcall category)
8
+ ablit_delta - what did abliteration cost? (base vs abliterated bars)
9
+ refusal_profile - did abliteration actually work? (complied/hedged/refused)
10
+ pareto - which quant is the knee? (pass-rate vs tok/s vs size)
11
+ ppl_curve - the intrinsic metric (perplexity vs quant)
12
+
13
+ Stats are merged at the CATEGORY level: for each (model, quant, lineage, category) the
14
+ newest run containing that category wins. Partial runs (e.g. `run --only toolcall_*`)
15
+ update their categories without shadowing a full run's other categories.
16
+
17
+ Every chart degrades gracefully: if the runs it needs aren't in the DB yet, it's skipped
18
+ with a reason instead of failing the command.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import sqlite3
24
+ from dataclasses import dataclass, field
25
+ from pathlib import Path
26
+
27
+ import matplotlib
28
+
29
+ matplotlib.use("Agg") # render to files; never require a display
30
+ import matplotlib.pyplot as plt
31
+
32
+ plt.style.use("seaborn-v0_8-whitegrid")
33
+
34
# Quantization labels ordered from lowest to highest fidelity. This stays a list
# because positions in it are used as sort ranks; labels missing from it are
# ranked after the last entry rather than raising.
QUANT_ORDER = [
    # 2- and 3-bit
    "Q2_K",
    "Q3_K_S",
    "Q3_K_M",
    "Q3_K_L",
    # 4-bit
    "Q4_0",
    "IQ4_XS",
    "Q4_K_S",
    "Q4_K_M",
    # 5-, 6- and 8-bit
    "Q5_K_S",
    "Q5_K_M",
    "Q6_K",
    "Q8_0",
    # unquantized float formats
    "F16",
    "BF16",
    "F32",
]

# Keyword arguments shared by every ``fig.savefig`` call in this module.
_FIG_KW = {
    "dpi": 150,
    "bbox_inches": "tight",
    "facecolor": "white",
}

# Series colours. Base vs abliterated lineage:
_COLOR_BASE = "#2563eb"    # blue
_COLOR_ABLIT = "#dc2626"   # red
# Refusal-profile segments:
_COLOR_COMPLY = "#16a34a"  # green
_COLOR_HEDGE = "#d97706"   # amber
_COLOR_REFUSE = "#dc2626"  # red
47
+
48
# Abliteration marker substrings that appear in model names (matched case-insensitively).
_ABLIT_MARKERS = ("uncensored", "abliterat", "heretic", "decensored", "deccp")


def _base_model_name(name: str) -> str:
    """Strip the abliteration suffix to recover the base model family name.

    The name is cut at the EARLIEST occurrence of any marker in
    ``_ABLIT_MARKERS`` and trailing/leading ``-``, ``_`` and spaces are removed
    from what is left. A name containing no marker is returned unchanged.

    Cutting at the earliest marker (rather than at the first marker in tuple
    order) matters for names that carry more than one marker, e.g.
    ``"Model-heretic-uncensored"`` -> ``"Model"``, not ``"Model-heretic"``.
    """
    lower = name.lower()
    positions = [idx for idx in (lower.find(marker) for marker in _ABLIT_MARKERS) if idx != -1]
    if not positions:
        return name
    return name[:min(positions)].strip("-_ ")
60
+
61
+
62
def quant_rank(quant: str | None) -> int:
    """Return the position of *quant* in ``QUANT_ORDER`` (case-insensitive).

    ``None``, an empty string, or a label that is not listed all rank as
    ``len(QUANT_ORDER)``, i.e. after every known quant.
    """
    normalized = quant.upper() if quant else ""
    if normalized in QUANT_ORDER:
        return QUANT_ORDER.index(normalized)
    return len(QUANT_ORDER)
67
+
68
+
69
@dataclass
class CatStats:
    """Aggregated results for a single category within one group."""

    # Pass rate; left as None for categories that are graded by label instead.
    rate: float | None = None
    # Tokens per second for the category, when available.
    tps: float | None = None
    # Counts keyed by label name: complied / hedged / refused.
    labels: dict = field(default_factory=dict)
74
+
75
+
76
@dataclass
class GroupStats:
    """Everything known about one (model_name, quant, lineage) combination.

    ``categories`` maps a category name to its ``CatStats``; the remaining
    optional fields hold perplexity and model-size information for the group.
    """

    model_name: str
    quant: str | None
    lineage: str
    categories: dict[str, CatStats] = field(default_factory=dict)
    ppl: float | None = None
    ppl_chunks: int | None = None
    model_size_bytes: int | None = None

    def capability_rate(self) -> float | None:
        """Unweighted mean of the per-category pass rates, or None if no category has one."""
        total = 0
        counted = 0
        for stats in self.categories.values():
            if stats.rate is None:
                continue
            total += stats.rate
            counted += 1
        if not counted:
            return None
        return total / counted

    def avg_tps(self) -> float | None:
        """Mean tok/s over categories with a truthy value (None and 0 are ignored)."""
        total = 0
        counted = 0
        for stats in self.categories.values():
            if not stats.tps:
                continue
            total += stats.tps
            counted += 1
        if not counted:
            return None
        return total / counted

    def label_counts(self) -> dict[str, int]:
        """Sum the label counts of every category into a single dict."""
        totals: dict[str, int] = {}
        for stats in self.categories.values():
            for label, count in stats.labels.items():
                totals.setdefault(label, 0)
                totals[label] += count
        return totals
101
+
102
+
103
def merged_stats(conn: sqlite3.Connection) -> dict[tuple, GroupStats]:
    """Build ``(model_name, quant, lineage) -> GroupStats``, newest run winning per category.

    Runs are visited in ascending ``id`` order, so a later run overwrites the
    categories (and size / perplexity values) it provides while leaving the
    categories it does not contain untouched.

    Only finished runs (``finished_at IS NOT NULL``) are considered. Unfinished
    runs may be useful for resuming a local session, but they are not
    scientifically valid inputs to charts.

    Rows are read through a private cursor whose ``row_factory`` is
    ``sqlite3.Row``, so the name-based column access below works regardless of
    how the caller configured ``conn.row_factory`` and without changing it.
    """
    per_category_sql = """
            SELECT category,
                   AVG(CASE WHEN passed = 1 THEN 1.0 ELSE 0.0 END) AS rate,
                   SUM(CASE WHEN passed IS NOT NULL THEN 1 ELSE 0 END) AS n_graded,
                   AVG(tok_per_sec) AS tps,
                   SUM(CASE WHEN label = 'complied' THEN 1 ELSE 0 END) AS complied,
                   SUM(CASE WHEN label = 'hedged' THEN 1 ELSE 0 END) AS hedged,
                   SUM(CASE WHEN label = 'refused' THEN 1 ELSE 0 END) AS refused
            FROM results WHERE run_id = ? GROUP BY category
            """

    def _fetch(sql: str, params: tuple = ()) -> list:
        cur = conn.cursor()
        try:
            cur.row_factory = sqlite3.Row
            return cur.execute(sql, params).fetchall()
        finally:
            cur.close()

    groups: dict[tuple, GroupStats] = {}
    for run in _fetch("SELECT * FROM runs WHERE finished_at IS NOT NULL ORDER BY id"):
        key = (run["model_name"], run["quant"], run["lineage"])
        g = groups.setdefault(key, GroupStats(run["model_name"], run["quant"], run["lineage"]))
        if run["model_size_bytes"]:
            g.model_size_bytes = run["model_size_bytes"]
        if run["ppl"] is not None:
            g.ppl, g.ppl_chunks = run["ppl"], run["ppl_chunks"]
        # This run is newer than anything stored so far: overwrite its categories.
        for r in _fetch(per_category_sql, (run["id"],)):
            labels = {k: r[k] for k in ("complied", "hedged", "refused") if r[k]}
            g.categories[r["category"]] = CatStats(
                # A category with no graded rows has no meaningful pass rate.
                rate=r["rate"] if r["n_graded"] else None,
                tps=r["tps"],
                labels=labels,
            )
    return groups
133
+
134
+
135
def _sweep(groups: dict[tuple, GroupStats]) -> list[GroupStats]:
    """Return the quantization sweep: the family with the most distinct quants.

    Groups are bucketed by (model_name, lineage); groups without a quant are
    ignored. The members of the widest bucket are returned sorted by
    ``quant_rank``. An empty list is returned when no group has a quant.
    """
    families: dict[tuple, list[GroupStats]] = {}
    for (model, quant, lineage), stats in groups.items():
        if not quant:
            continue
        families.setdefault((model, lineage), []).append(stats)

    if not families:
        return []

    def distinct_quants(members: list[GroupStats]) -> int:
        return len({member.quant for member in members})

    widest = max(families.values(), key=distinct_quants)
    return sorted(widest, key=lambda member: quant_rank(member.quant))
145
+
146
+
147
def _style_ax(ax, fig) -> None:
    """Apply the shared look: white figure, light-grey axes, no top/right spines."""
    fig.patch.set_facecolor("white")
    ax.set_facecolor("#f9fafb")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
152
+
153
+
154
def _curve(sweep: list[GroupStats], categories: list[str], title: str, path: Path) -> Path:
    """Plot one pass-rate line per category across the sweep and save it to *path*.

    The x axis is the quant label of each group in *sweep* (in the given
    order); the y axis is the pass rate in percent. A group that lacks a
    category, or has no rate for it, contributes ``None`` for that point.

    Returns *path*. The figure is closed even if plotting or saving raises, so
    a failed render does not leave an open figure behind.
    """
    fig, ax = plt.subplots(figsize=(9, 5))
    try:
        _style_ax(ax, fig)
        quants = [g.quant for g in sweep]
        for cat in categories:
            ys = []
            for g in sweep:
                stats = g.categories.get(cat)
                has_rate = stats is not None and stats.rate is not None
                ys.append(stats.rate * 100 if has_rate else None)
            ax.plot(quants, ys, marker="o", linewidth=2.0, markersize=7, label=cat)
        ax.set_ylabel("pass rate (%)", fontsize=10)
        ax.set_ylim(0, 105)
        ax.set_title(title, fontsize=12, fontweight="bold", pad=12)
        ax.legend(fontsize=8, framealpha=0.9)
        fig.savefig(path, **_FIG_KW)
    finally:
        plt.close(fig)
    return path
169
+
170
+
171
def chart_quant_curve(conn, out_dir: Path) -> Path | str:
    """Render capability pass rate vs quantization for the sweep family.

    Returns the written path, or a skip reason (str) when the sweep has fewer
    than three quants or no non-toolcall category has a pass rate.
    """
    sweep = _sweep(merged_stats(conn))
    if len(sweep) < 3:
        return "needs >=3 quants of one model (run the sweep)"

    graded: set[str] = set()
    for group in sweep:
        for cat_name, cat_stats in group.categories.items():
            # Toolcall categories have their own chart; ungraded ones have nothing to plot.
            if cat_name.startswith("toolcall") or cat_stats.rate is None:
                continue
            graded.add(cat_name)
    if not graded:
        return "no capability categories yet"

    first = sweep[0]
    title = f"{first.model_name} ({first.lineage}) - capability vs quantization"
    return _curve(sweep, sorted(graded), title, out_dir / "quant_curve.png")
183
+
184
+
185
def chart_toolcall_curve(conn, out_dir: Path) -> Path | str:
    """Render per-category tool-calling pass rate vs quantization.

    Only sweep members that contain at least one ``toolcall*`` category are
    plotted. Returns the written path, or a skip reason (str) when fewer than
    three such quants exist or none of their toolcall categories has a pass
    rate - the same guard ``chart_quant_curve`` applies, so an empty chart is
    never written.
    """
    sweep = _sweep(merged_stats(conn))
    sweep = [g for g in sweep if any(c.startswith("toolcall") for c in g.categories)]
    if len(sweep) < 3:
        return "needs >=3 quants with toolcall results (run --only 'toolcall_*' across quants)"
    cats = sorted({c for g in sweep for c, s in g.categories.items()
                   if s.rate is not None and c.startswith("toolcall")})
    if not cats:
        # Toolcall categories exist but none is pass/fail graded: nothing to draw.
        return "no graded toolcall categories yet"
    g0 = sweep[0]
    return _curve(sweep, cats,
                  f"{g0.model_name} ({g0.lineage}) - tool calling vs quantization",
                  out_dir / "toolcall_curve.png")
196
+
197
+
198
def _matched_pair(groups: dict[tuple, GroupStats]) -> tuple[GroupStats, GroupStats] | None:
    """Return a (base, abliterated) pair from the same model family at the same quant.

    Matching strips the abliteration marker from the abliterated model name to
    recover the base family name, then looks for a ``lineage == "base"`` group
    with exactly that model name and the same quant. Groups without a quant
    are ignored.

    Among all valid pairs the highest-fidelity quant wins. Quants that are not
    listed in ``QUANT_ORDER`` have unknown fidelity, so they are chosen only
    when no pair at a known quant exists (``quant_rank`` alone would place them
    above F32). Ties keep the first pair encountered. Returns None when there
    is no pair.
    """
    base_by_name: dict[str, dict[str, GroupStats]] = {}  # {model_name: {quant: group}}
    for g in groups.values():
        if g.lineage == "base" and g.quant:
            base_by_name.setdefault(g.model_name, {})[g.quant] = g

    best: tuple[GroupStats, GroupStats] | None = None
    best_key: tuple[bool, int] | None = None
    for g in groups.values():
        if g.lineage != "abliterated" or not g.quant:
            continue
        base = base_by_name.get(_base_model_name(g.model_name), {}).get(g.quant)
        if base is None:
            continue
        rank = quant_rank(g.quant)
        # Known quants always outrank unknown ones; within each class, higher rank wins.
        key = (rank < len(QUANT_ORDER), rank)
        if best_key is None or key > best_key:
            best_key = key
            best = (base, g)
    return best
224
+
225
+
226
def chart_ablit_delta(conn, out_dir: Path) -> Path | str:
    """Render side-by-side base vs abliterated pass rates for one matched pair.

    Only categories that have a pass rate in BOTH groups are drawn; each pair
    of bars is annotated with the abliterated-minus-base difference in
    percentage points. Returns the written path, or a skip reason (str) when
    there is no matched pair or the pair shares no graded category.

    The figure is closed even if drawing or saving raises.
    """
    pair = _matched_pair(merged_stats(conn))
    if not pair:
        return "needs a base + abliterated run at the same quant"
    b, a = pair
    cats = sorted(c for c in set(b.categories) & set(a.categories)
                  if b.categories[c].rate is not None and a.categories[c].rate is not None)
    if not cats:
        return "matched runs share no graded categories"

    x = range(len(cats))
    w = 0.38  # bar width; the two bars of a category sit either side of its tick
    path = out_dir / "ablit_delta.png"
    fig, ax = plt.subplots(figsize=(max(9, 1.2 * len(cats)), 5))
    try:
        _style_ax(ax, fig)
        rb = [b.categories[c].rate * 100 for c in cats]
        ra = [a.categories[c].rate * 100 for c in cats]
        ax.bar([i - w / 2 for i in x], rb, w, label=f"base [{b.quant}]", color=_COLOR_BASE, alpha=0.9)
        ax.bar([i + w / 2 for i in x], ra, w, label=f"abliterated [{a.quant}]", color=_COLOR_ABLIT, alpha=0.9)
        for i in x:
            delta = ra[i] - rb[i]
            # Green for any gain, red for a loss of more than 1pp, grey otherwise.
            color = "#15803d" if delta > 0 else ("#dc2626" if delta < -1 else "#6b7280")
            ax.annotate(f"{delta:+.0f}pp", (i, max(rb[i], ra[i]) + 2),
                        ha="center", fontsize=8, fontweight="bold", color=color)
        ax.set_xticks(list(x), cats, rotation=35, ha="right", fontsize=8)
        ax.set_ylabel("pass rate (%)", fontsize=10)
        ax.set_ylim(0, 115)  # headroom above 100% for the delta annotations
        ax.set_title(f"{b.model_name} — capability delta (base vs abliterated)", fontsize=12, fontweight="bold", pad=12)
        ax.legend(framealpha=0.9)
        fig.savefig(path, **_FIG_KW)
    finally:
        plt.close(fig)
    return path
258
+
259
+
260
def chart_refusal_profile(conn, out_dir: Path) -> Path | str:
    """Render a stacked complied / hedged / refused share bar per group.

    Every group with at least one label count gets one bar, base lineage first
    and then by ascending quant rank. Returns the written path, or a skip
    reason (str) when no group has label counts.

    Bars are placed at numeric positions rather than by tick-label string:
    two groups can shorten to the same label (e.g. a base model and its
    abliterated variant at the same quant), and string-valued x positions would
    stack them on one bar while the annotations are positioned by index. Labels
    that collide additionally get the lineage appended so they can be told
    apart. The figure is closed even if drawing or saving raises.
    """
    groups = [g for g in merged_stats(conn).values() if g.label_counts()]
    if not groups:
        return "no refusal-graded results yet"
    groups = sorted(groups, key=lambda g: (g.lineage != "base", quant_rank(g.quant)))

    def _short_name(g: GroupStats) -> str:
        # Keep at most the first three dash-separated parts of long names,
        # e.g. "LFM2.5-1.2B-Instruct-abliterated" → "LFM2.5-1.2B-Instruct".
        parts = g.model_name.split("-")
        short = "-".join(parts[:3]) if len(parts) > 3 else g.model_name
        return f"{short}\n[{g.quant}]"

    labels = [_short_name(g) for g in groups]
    seen: dict[str, int] = {}
    for lab in labels:
        seen[lab] = seen.get(lab, 0) + 1
    labels = [f"{lab}\n({g.lineage})" if seen[lab] > 1 else lab
              for lab, g in zip(labels, groups)]

    positions = list(range(len(groups)))
    order = ["complied", "hedged", "refused"]
    colors = {"complied": _COLOR_COMPLY, "hedged": _COLOR_HEDGE, "refused": _COLOR_REFUSE}
    counts = [g.label_counts() for g in groups]
    totals = [sum(c.values()) for c in counts]  # never 0: groups were filtered on label counts

    path = out_dir / "refusal_profile.png"
    fig, ax = plt.subplots(figsize=(max(8, 1.4 * len(groups)), 5))
    try:
        _style_ax(ax, fig)
        bottom = [0.0] * len(groups)
        for lab in order:
            vals = [100 * c.get(lab, 0) / t for c, t in zip(counts, totals)]
            ax.bar(positions, vals, bottom=bottom, label=lab, color=colors[lab], alpha=0.92)
            for i, v in enumerate(vals):
                if v >= 6:  # segments thinner than this are too small to hold a label
                    ax.annotate(f"{v:.0f}%", (i, bottom[i] + v / 2),
                                ha="center", va="center", fontsize=9, fontweight="bold", color="white")
            bottom = [b + v for b, v in zip(bottom, vals)]
        ax.set_xticks(positions, labels)
        ax.set_ylabel("share of refusal-eval prompts (%)", fontsize=10)
        ax.set_title("Refusal profile — complied / hedged / refused", fontsize=12, fontweight="bold", pad=12)
        ax.legend(loc="lower right", framealpha=0.9)
        fig.savefig(path, **_FIG_KW)
    finally:
        plt.close(fig)
    return path
297
+
298
+
299
def chart_pareto(conn, out_dir: Path, min_tps: float = 20) -> Path | str:
    """Render overall capability pass rate vs generation speed, one marker per group.

    Marker area scales with model file size and colour encodes lineage.

    Args:
        conn: database connection passed through to ``merged_stats``.
        out_dir: directory that receives ``pareto.png``.
        min_tps: groups whose average tok/s is below this are skipped.
            Implausibly low values are artefacts of multi-worker runs where
            each slot ran at (total_tps / n_workers) rather than full speed.
            Defaults to 20, the previously hard-coded cutoff.

    Returns:
        The written path, or a skip reason (str) when fewer than three groups
        qualify.

    The figure is closed even if drawing or saving raises.
    """
    points = []
    for g in merged_stats(conn).values():
        rate, tps = g.capability_rate(), g.avg_tps()
        if g.quant and rate is not None and tps and tps >= min_tps:
            points.append((g, rate, tps))
    if len(points) < 3:
        return "needs >=3 runs with capability results"

    path = out_dir / "pareto.png"
    fig, ax = plt.subplots(figsize=(9, 5))
    try:
        _style_ax(ax, fig)
        for g, rate, tps in points:
            gb = (g.model_size_bytes or 0) / 1e9
            color = _COLOR_ABLIT if g.lineage == "abliterated" else _COLOR_BASE
            # Clamp the size factor so tiny or unknown-size models stay visible.
            ax.scatter(tps, rate * 100, s=140 * max(gb, 0.3), color=color, alpha=0.80, zorder=3, edgecolors="white", linewidths=0.5)
            ax.annotate(f" {g.quant} ({gb:.1f} GB)", (tps, rate * 100), fontsize=9)
        ax.set_xlabel("generation speed (tok/s, server-reported)", fontsize=10)
        ax.set_ylabel("overall capability pass rate (%)", fontsize=10)
        ax.set_title("Quality vs speed — where the knee is (marker size = file size)", fontsize=12, fontweight="bold", pad=12)
        fig.savefig(path, **_FIG_KW)
    finally:
        plt.close(fig)
    return path
324
+
325
+
326
def chart_ppl_curve(conn, out_dir: Path) -> Path | str:
    """Perplexity vs quant - the intrinsic metric; moves smoothly where task scores jump.

    Plots the stored perplexity of every sweep member that has one. Returns
    the written path, or a skip reason (str) when fewer than three members
    have perplexity or when their ``ppl_chunks`` values differ (perplexities
    measured over different chunk counts are not comparable).

    The figure is closed even if drawing or saving raises.
    """
    sweep = [g for g in _sweep(merged_stats(conn)) if g.ppl]
    if len(sweep) < 3:
        return "needs >=3 runs with stored ppl (run `crucible ppl <model>`)"
    chunk_counts = {g.ppl_chunks for g in sweep}
    if len(chunk_counts) > 1:
        # ppl_chunks may be None for some runs; None and int do not compare, so
        # sort with a key that puts None first instead of raising TypeError.
        ordered = sorted(chunk_counts, key=lambda c: (c is not None, c if c is not None else 0))
        return f"mixed ppl_chunks {ordered} - values not comparable; re-measure"

    path = out_dir / "ppl_curve.png"
    fig, ax = plt.subplots(figsize=(9, 5))
    try:
        _style_ax(ax, fig)
        quants = [g.quant for g in sweep]
        ax.plot(quants, [g.ppl for g in sweep], marker="o", linewidth=2.0, markersize=7, color=_COLOR_BASE)
        for g in sweep:
            ax.annotate(f" {g.ppl:.2f}", (g.quant, g.ppl), fontsize=9)
        ax.set_ylabel(f"WikiText-2 perplexity ({sweep[0].ppl_chunks} chunks, lower = better)", fontsize=10)
        ax.set_title(f"{sweep[0].model_name} ({sweep[0].lineage}) — perplexity vs quantization", fontsize=12, fontweight="bold", pad=12)
        fig.savefig(path, **_FIG_KW)
    finally:
        plt.close(fig)
    return path
347
+
348
+
349
# Chart name -> renderer. Each renderer takes (conn, out_dir) and returns the
# written Path or a str explaining why the chart was skipped. Insertion order is
# the order in which render_all runs them.
CHARTS = {
    "quant_curve": chart_quant_curve,
    "toolcall_curve": chart_toolcall_curve,
    "ablit_delta": chart_ablit_delta,
    "refusal_profile": chart_refusal_profile,
    "pareto": chart_pareto,
    "ppl_curve": chart_ppl_curve,
}


def render_all(conn, out_dir: str | Path) -> dict[str, Path | str]:
    """Run every registered chart and report what happened to each.

    *out_dir* is created (including parents) if it does not exist. The result
    maps each chart name to the Path that was written, or to the skip reason
    (str) the renderer returned.
    """
    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)
    outcome: dict[str, Path | str] = {}
    for name, render in CHARTS.items():
        outcome[name] = render(conn, target)
    return outcome