proofrag 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
proofrag/judge.py ADDED
@@ -0,0 +1,142 @@
1
+ """LLM-as-judge scoring + rank-aware retrieval metrics → results.json.
2
+
3
+ Generation quality is scored by a pinned judge model on four dimensions
4
+ (groundedness, correctness, completeness, citation_quality). Retrieval quality is
5
+ scored separately (Recall@k, Precision@k, NDCG@k, MRR), so a retriever miss is
6
+ never blamed on the generator. The judge fingerprint is recorded so two scorecards
7
+ are only compared when they used the same judge.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import datetime as _dt
13
+ import json
14
+ from typing import Any
15
+
16
+ from .llm import LLM
17
+ from .metrics import RETRIEVAL_METRICS, Matcher, lexical_matcher, retrieval_metrics
18
+
19
# The dimensions the judge scores. The same names are used as keys when reading
# the judge's JSON reply and when aggregating per-record scores.
JUDGE_DIMENSIONS = ["groundedness", "correctness", "completeness", "citation_quality"]

# System prompt sent with every judge request.
JUDGE_SYS = (
    "You are a strict, consistent evaluator of RAG answers. "
    "Score each dimension from 0.0 to 1.0. Be calibrated: 1.0 means flawless, "
    "0.5 means partially right, 0.0 means absent or wrong. Output JSON only."
)

# Per-case user prompt, filled with str.format (fields: q, gold, ctx, ans).
# The doubled braces on the last line are format escapes that produce a literal
# JSON example in the rendered prompt.
JUDGE_TMPL = '''Question: {q}

Reference (gold) answer: {gold}

Context the system retrieved:
"""{ctx}"""

System's answer: {ans}

Score 0.0-1.0:
- groundedness: is the answer supported by the retrieved context (no hallucination)?
- correctness: do its facts match the reference answer?
- completeness: does it cover what the reference covers?
- citation_quality: are claims attributable to the retrieved context?

Return JSON:
{{"groundedness": 0.0, "correctness": 0.0, "completeness": 0.0, "citation_quality": 0.0, "rationale": "one short sentence"}}'''
44
+
45
+
46
def evaluate(
    goldenset: list[dict],
    predictions: list[dict],
    llm: LLM | None = None,
    k: int = 5,
    matcher: Matcher | None = None,
) -> dict:
    """Score every golden-set case that has a prediction and aggregate the results.

    Predictions are joined to golden-set entries on ``id``; golden-set entries
    without a prediction are skipped. Each joined case is scored by the judge,
    and retrieval metrics are computed with cutoff ``k`` using ``matcher`` to
    decide chunk relevance (lexical token-overlap when not supplied; pass
    ``embedding_matcher()`` for semantic matching).

    Returns a dict with the judge fingerprint, a UTC creation timestamp, ``k``,
    the number of scored cases ``n``, the aggregate scores and the per-case
    records.
    """
    if not llm:
        llm = LLM()
    if not matcher:
        matcher = lexical_matcher()

    predictions_by_id = {}
    for prediction in predictions:
        predictions_by_id[prediction["id"]] = prediction

    records: list[dict] = []
    for gold in goldenset:
        prediction = predictions_by_id.get(gold["id"])
        if prediction is None:
            continue

        retrieved = prediction.get("retrieved_contexts", []) or []
        answer = prediction.get("answer", "")
        gold_contexts = gold.get("gold_contexts", []) or []

        scores = _judge_one(llm, gold, answer, retrieved)

        # Unanswerable cases carry no gold context, so there is nothing to
        # score the retriever against.
        if gold_contexts:
            retrieval = retrieval_metrics(gold_contexts, retrieved, k, matcher)
        else:
            retrieval = None

        record = {
            "id": gold["id"],
            "question": gold["question"],
            "difficulty": gold.get("difficulty", "single_doc"),
            "answer": answer,
            "scores": scores,
            "retrieval": retrieval,
        }
        # The rationale is stored beside the scores, not inside them, so the
        # scores dict ends up holding only the numeric dimensions.
        record["rationale"] = scores.pop("rationale", "")
        records.append(record)

    summary = {
        "judge_fingerprint": llm.fingerprint,
        "created": _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds"),
        "k": k,
        "n": len(records),
    }
    summary["aggregate"] = _aggregate(records)
    summary["records"] = records
    return summary
97
+
98
+
99
def _judge_one(llm: LLM, gold: dict, answer: str, retrieved: list[str]) -> dict:
    """Ask the judge to score one answer and return its clamped scores.

    The retrieved chunks are joined into one context string, cut to 4000
    characters, and placed in the judge prompt with the question, the gold
    answer and the system's answer. The result holds one clamped score per
    judge dimension plus a ``rationale`` string (at most 300 characters).

    If the judge call raises, every dimension is scored 0.0 and the rationale
    records the error, so one failing case does not abort the run.
    """
    if retrieved:
        context = "\n\n---\n\n".join(retrieved)
    else:
        context = "(no context retrieved)"

    prompt = JUDGE_TMPL.format(
        q=gold["question"],
        gold=gold.get("gold_answer", ""),
        ctx=context[:4000],
        ans=answer or "(no answer)",
    )

    try:
        reply = llm.complete_json(JUDGE_SYS, prompt)
    except Exception as exc:  # noqa: BLE001 - record the failure, keep going
        failed: dict[str, Any] = dict.fromkeys(JUDGE_DIMENSIONS, 0.0)
        failed["rationale"] = f"judge error: {exc}"
        return failed

    verdict: dict[str, Any] = {}
    for dimension in JUDGE_DIMENSIONS:
        verdict[dimension] = _clamp(reply.get(dimension, 0.0))
    verdict["rationale"] = str(reply.get("rationale", ""))[:300]
    return verdict
114
+
115
+
116
+ def _clamp(v) -> float:
117
+ try:
118
+ return round(max(0.0, min(1.0, float(v))), 3)
119
+ except (TypeError, ValueError):
120
+ return 0.0
121
+
122
+
123
+ def _mean(values: list[float]) -> float:
124
+ return round(sum(values) / len(values), 3) if values else 0.0
125
+
126
+
127
def _aggregate(records: list[dict]) -> dict:
    """Average judge scores and retrieval metrics across records.

    Judge dimensions are averaged over every record. Retrieval metrics are
    averaged only over records that have a non-empty ``retrieval`` entry.
    """
    summary = {}
    for dimension in JUDGE_DIMENSIONS:
        summary[dimension] = _mean([rec["scores"][dimension] for rec in records])

    with_retrieval = [rec["retrieval"] for rec in records if rec.get("retrieval")]
    for metric in RETRIEVAL_METRICS:
        summary[metric] = _mean([row[metric] for row in with_retrieval])
    return summary
133
+
134
+
135
def write_results(results: dict, path: str) -> None:
    """Write ``results`` to ``path`` as indented UTF-8 JSON.

    The payload is serialized before the file is opened, so a serialization
    error (for example a non-JSON-serializable value) raises without
    truncating or partially overwriting an existing file at ``path``.
    """
    payload = json.dumps(results, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
138
+
139
+
140
def read_results(path: str) -> dict:
    """Load and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding="utf-8") as handle:
        loaded = json.load(handle)
    return loaded
proofrag/llm.py ADDED
@@ -0,0 +1,117 @@
1
+ """Provider-agnostic LLM client.
2
+
3
+ Auto-detects Anthropic or OpenAI-compatible (incl. local/Ollama via OPENAI_BASE_URL).
4
+ Defaults to a cheap model so generating a golden set + judging doesn't cost much.
5
+ The judge model is pinned and surfaced as a `fingerprint` so scores stay comparable.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import re
13
+ from typing import Any
14
+
15
# Cheap-by-default. Override with PROOFRAG_MODEL.
# Model used when the provider is "anthropic" and no model is given.
DEFAULT_ANTHROPIC_MODEL = "claude-haiku-4-5-20251001"
# Model used for any other provider when no model is given.
DEFAULT_OPENAI_MODEL = "gpt-4o-mini"
18
+
19
+
20
class LLMError(RuntimeError):
    """Signals that the LLM backend is misconfigured or cannot be used."""
22
+
23
+
24
class LLM:
    """Small client over Anthropic / OpenAI-compatible chat completions.

    The provider and model are resolved once at construction time. The backend
    SDK is imported, and its client created, only when the first request is
    made.
    """

    def __init__(self, provider: str | None = None, model: str | None = None):
        # Precedence for both settings: explicit argument, then environment
        # variable, then auto-detection / built-in default.
        chosen_provider = provider or os.environ.get("PROOFRAG_PROVIDER")
        if not chosen_provider:
            chosen_provider = self._autodetect()
        self.provider = chosen_provider

        # Resolved after the provider because the default model depends on it.
        chosen_model = model or os.environ.get("PROOFRAG_MODEL")
        if not chosen_model:
            chosen_model = self._default_model()
        self.model = chosen_model

        self._client: Any = None  # backend SDK client, created on first use

    @staticmethod
    def _autodetect() -> str:
        """Pick a provider from whichever API key is set (Anthropic is checked first)."""
        candidates = (
            ("ANTHROPIC_API_KEY", "anthropic"),
            ("OPENAI_API_KEY", "openai"),
        )
        for env_var, provider_name in candidates:
            if os.environ.get(env_var):
                return provider_name
        raise LLMError(
            "No LLM credentials found. Set ANTHROPIC_API_KEY or OPENAI_API_KEY "
            "(or run `proofrag demo` to see a scorecard with no API key)."
        )

    def _default_model(self) -> str:
        """Return the built-in default model for the current provider."""
        if self.provider == "anthropic":
            return DEFAULT_ANTHROPIC_MODEL
        return DEFAULT_OPENAI_MODEL

    @property
    def fingerprint(self) -> str:
        """Identifier of the judge backend (``provider:model``), recorded in every scorecard."""
        return f"{self.provider}:{self.model}"

    def complete_json(self, system: str, prompt: str) -> dict[str, Any]:
        """Run one completion and parse the first JSON object in the reply."""
        raw_reply = self._complete(system, prompt)
        return _extract_json(raw_reply)

    # -- backends ---------------------------------------------------------

    def _complete(self, system: str, prompt: str) -> str:
        """Send the request to the configured provider and return its text reply."""
        if self.provider not in ("anthropic", "openai"):
            raise LLMError(f"Unknown provider: {self.provider!r}")
        if self.provider == "anthropic":
            return self._anthropic(system, prompt)
        return self._openai(system, prompt)

    def _anthropic(self, system: str, prompt: str) -> str:
        """Complete via the Anthropic SDK and return the concatenated text blocks."""
        try:
            import anthropic
        except ImportError as e:
            raise LLMError("Anthropic backend needs: pip install 'proofrag[anthropic]'") from e

        if self._client is None:
            self._client = anthropic.Anthropic()

        msg = self._client.messages.create(
            model=self.model,
            max_tokens=2048,
            system=system,
            messages=[{"role": "user", "content": prompt}],
        )
        # Keep only blocks typed "text"; other block types contribute nothing.
        pieces = []
        for block in msg.content:
            if getattr(block, "type", "") == "text":
                pieces.append(getattr(block, "text", ""))
        return "".join(pieces)

    def _openai(self, system: str, prompt: str) -> str:
        """Complete via the OpenAI SDK (honouring OPENAI_BASE_URL) and return the reply text."""
        try:
            import openai
        except ImportError as e:
            raise LLMError("OpenAI backend needs: pip install 'proofrag[openai]'") from e

        if self._client is None:
            base = os.environ.get("OPENAI_BASE_URL")
            client_kwargs = {"base_url": base} if base else {}
            self._client = openai.OpenAI(**client_kwargs)

        conversation = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]
        resp = self._client.chat.completions.create(
            model=self.model,
            temperature=0,
            messages=conversation,
        )
        return resp.choices[0].message.content or ""
98
+
99
+
100
+ def _extract_json(text: str) -> dict:
101
+ """Pull the first JSON object out of a model response (handles code fences)."""
102
+ text = text.strip()
103
+ fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
104
+ if fence:
105
+ text = fence.group(1)
106
+ start = text.find("{")
107
+ if start == -1:
108
+ raise LLMError(f"No JSON object in response: {text[:200]!r}")
109
+ depth = 0
110
+ for i in range(start, len(text)):
111
+ if text[i] == "{":
112
+ depth += 1
113
+ elif text[i] == "}":
114
+ depth -= 1
115
+ if depth == 0:
116
+ return json.loads(text[start : i + 1])
117
+ raise LLMError(f"Unbalanced JSON in response: {text[:200]!r}")
proofrag/metrics.py ADDED
@@ -0,0 +1,106 @@
1
+ """Rank-aware retrieval metrics (Recall@k, Precision@k, NDCG@k, MRR).
2
+
3
+ These separate retriever failures from generator failures: if NDCG@k is low, the
4
+ right context never reached the model in a usable rank, so a bad answer isn't the
5
+ LLM's fault.
6
+
7
+ Relevance is decided by a pluggable *matcher* `(gold_context, retrieved_chunk) ->
8
+ bool`. The default is token-overlap (Jaccard) — dependency-free, runs anywhere.
9
+ Swap in `embedding_matcher()` (see embeddings.py) for semantic matching without
10
+ touching the metric code.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import math
16
+ import re
17
+ from collections.abc import Callable
18
+
19
# Token pattern: a run of lowercase ASCII letters and digits.
_WORD = re.compile(r"[a-z0-9]+")

# A matcher takes (gold_context, retrieved_chunk) and returns whether they match.
Matcher = Callable[[str, str], bool]
# Names of the retrieval metrics; these are the keys of the per-case metrics dict.
RETRIEVAL_METRICS = ["recall_at_k", "precision_at_k", "ndcg_at_k", "mrr"]
23
+
24
+
25
def _tokens(text: str) -> set[str]:
    """Return the set of distinct word tokens in ``text``, compared case-insensitively."""
    lowered = text.lower()
    return set(_WORD.findall(lowered))
27
+
28
+
29
def _jaccard(a: str, b: str) -> float:
    """Return the Jaccard similarity of the token sets of ``a`` and ``b``.

    The result is 0.0 when either text has no tokens.
    """
    left = _tokens(a)
    right = _tokens(b)
    if not (left and right):
        return 0.0
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)
34
+
35
+
36
def lexical_matcher(threshold: float = 0.4) -> Matcher:
    """Build the default matcher: a match is a token-overlap (Jaccard) score >= threshold."""

    def _matches(gold, chunk):
        return _jaccard(gold, chunk) >= threshold

    return _matches
39
+
40
+
41
+ def _relevance(gold_contexts: list[str], chunk: str, matcher: Matcher) -> bool:
42
+ return any(matcher(g, chunk) for g in gold_contexts)
43
+
44
+
45
def recall_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Return the share of gold contexts matched by at least one of the top-k chunks.

    With no gold contexts (e.g. an unanswerable case) there is nothing to
    retrieve, so recall is 1.0.
    """
    if not gold_contexts:
        return 1.0

    window = retrieved[:k]
    found = 0
    for gold in gold_contexts:
        for chunk in window:
            if matcher(gold, chunk):
                found += 1
                break  # a gold context counts once however many chunks match it
    return found / len(gold_contexts)
52
+
53
+
54
def precision_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Return the share of the top-k retrieved chunks that match some gold context.

    The denominator is the number of chunks actually in the top-k window, and
    the result is 0.0 when that window is empty.
    """
    window = retrieved[:k]
    if not window:
        return 0.0

    relevant = 0
    for chunk in window:
        if any(matcher(gold, chunk) for gold in gold_contexts):
            relevant += 1
    return relevant / len(window)
61
+
62
+
63
def ndcg_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Normalized DCG@k with binary relevance — rewards relevant chunks ranked high.

    Each of the top-k retrieved chunks gets gain 1.0 if it matches any gold
    context, else 0.0, discounted by ``log2(rank + 1)`` with ranks starting
    at 1.

    The ideal DCG assumes every gold context could have been retrieved at the
    top of the list, capped at ``k`` positions. It is therefore not built only
    from the hits that were actually retrieved: a retriever that surfaces one
    of three gold contexts at rank 1 scores below 1.0. When several chunks
    match the same gold context, the ideal is widened to the number of hits so
    the result never exceeds 1.0.

    Returns 0.0 when there is nothing relevant to find or nothing retrieved.
    """
    gains = [
        1.0 if any(matcher(g, c) for g in gold_contexts) else 0.0
        for c in retrieved[:k]
    ]
    dcg = sum(gain / math.log2(rank + 2) for rank, gain in enumerate(gains))

    # Relevant items an ideal top-k could hold: at least the hits we saw
    # (keeps the score <= 1.0), at least the gold count, never more than k.
    ideal_hits = min(k, max(len(gold_contexts), int(sum(gains))))
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0
69
+
70
+
71
def mrr(gold_contexts, retrieved, matcher) -> float:
    """Return 1/rank of the first retrieved chunk matching a gold context (0.0 if none)."""
    rank = 0
    for chunk in retrieved:
        rank += 1
        if any(matcher(gold, chunk) for gold in gold_contexts):
            return 1.0 / rank
    return 0.0
77
+
78
+
79
def retrieval_metrics(
    gold_contexts: list[str],
    retrieved: list[str],
    k: int = 5,
    matcher: Matcher | None = None,
) -> dict:
    """Compute every retrieval metric for one (gold_contexts, retrieved) pair.

    Each value is rounded to 3 places. MRR is computed over the full retrieved
    list; the other metrics use cutoff ``k``. The lexical matcher is used when
    ``matcher`` is not supplied.
    """
    if not matcher:
        matcher = lexical_matcher()

    recall = recall_at_k(gold_contexts, retrieved, k, matcher)
    precision = precision_at_k(gold_contexts, retrieved, k, matcher)
    ndcg = ndcg_at_k(gold_contexts, retrieved, k, matcher)
    reciprocal_rank = mrr(gold_contexts, retrieved, matcher)

    return {
        "recall_at_k": round(recall, 3),
        "precision_at_k": round(precision, 3),
        "ndcg_at_k": round(ndcg, 3),
        "mrr": round(reciprocal_rank, 3),
    }
93
+
94
+
95
+ # --- back-compat -------------------------------------------------------------
96
+
97
+
98
def context_matches(gold: str, retrieved: list[str], threshold: float = 0.4) -> bool:
    """Return True if any retrieved context lexically matches ``gold`` at ``threshold``."""
    is_match = lexical_matcher(threshold)
    for chunk in retrieved:
        if is_match(gold, chunk):
            return True
    return False
102
+
103
+
104
def retrieval_recall(gold_contexts, retrieved, threshold: float = 0.4) -> float:
    """Recall over the whole retrieved list (k = its length). Kept for back-compat."""
    cutoff = len(retrieved) or 1
    matcher = lexical_matcher(threshold)
    return recall_at_k(gold_contexts, retrieved, cutoff, matcher)
proofrag/scorecard.py ADDED
@@ -0,0 +1,218 @@
1
+ """Render a results dict into a self-contained, shareable HTML scorecard.
2
+
3
+ Zero external assets — inline CSS, no JS, no fonts fetched. Open the file
4
+ anywhere, attach it to a PR, drop it in CI artifacts. This is the artifact
5
+ people screenshot, so it is built to look good.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import html
11
+
12
+ from .judge import JUDGE_DIMENSIONS
13
+ from .metrics import RETRIEVAL_METRICS
14
+
15
# Display labels for the judge (generation) dimensions, keyed by dimension name.
_GEN_LABELS = {
    "groundedness": "Groundedness",
    "correctness": "Correctness",
    "completeness": "Completeness",
    "citation_quality": "Citation Quality",
}
21
+
22
+
23
+ def _ret_labels(k: int) -> dict:
24
+ return {
25
+ "recall_at_k": f"Recall@{k}",
26
+ "precision_at_k": f"Precision@{k}",
27
+ "ndcg_at_k": f"NDCG@{k}",
28
+ "mrr": "MRR",
29
+ }
30
+
31
+
32
+ def _grade(v: float) -> str:
33
+ if v >= 0.85:
34
+ return "good"
35
+ if v >= 0.65:
36
+ return "ok"
37
+ return "bad"
38
+
39
+
40
def _bar(label: str, value: float) -> str:
    """Render one labelled progress bar for a 0-1 ``value``.

    The value is shown as a whole-number percentage, which also sets the bar
    width; the fill colour comes from the value's grade. The label is
    HTML-escaped.
    """
    pct = round(value * 100)
    safe_label = html.escape(label)
    grade = _grade(value)
    return f"""
<div class="metric">
<div class="metric-head"><span>{safe_label}</span><b>{pct}</b></div>
<div class="track"><div class="fill {grade}" style="width:{pct}%"></div></div>
</div>"""
47
+
48
+
49
def _card(label: str, value: float) -> str:
    """Render one headline card showing a 0-1 ``value`` as a whole-number percentage.

    The card's colour class comes from the value's grade. The label is
    HTML-escaped.
    """
    pct = round(value * 100)
    grade = _grade(value)
    safe_label = html.escape(label)
    return f"""
<div class="card {grade}">
<div class="card-val">{pct}</div>
<div class="card-label">{safe_label}</div>
</div>"""
56
+
57
+
58
def _gen_mean(r: dict) -> float:
    """Return the unrounded mean of a record's judge scores across all dimensions."""
    scores = r["scores"]
    total = sum(scores[dimension] for dimension in JUDGE_DIMENSIONS)
    return total / len(JUDGE_DIMENSIONS)
61
+
62
+
63
def _num_cell(value) -> str:
    """Render a numeric table cell for a 0-1 score, or a muted dash when it is None."""
    if value is None:
        return '<td class="num mut">—</td>'
    grade = _grade(value)
    pct = round(value * 100)
    return f'<td class="num {grade}">{pct}</td>'
67
+
68
+
69
def render(results: dict) -> str:
    """Render a results dict as a complete HTML scorecard page.

    The page shows an overall generation score (mean of the aggregate judge
    dimensions, as a whole-number percentage), headline cards, bar charts for
    generation and retrieval metrics, and a table of up to eight records with
    the lowest mean judge score. Text taken from the results (questions,
    rationales, difficulty tags, judge fingerprint, timestamp) is HTML-escaped.
    """
    aggregate = results.get("aggregate", {})
    records = results.get("records", [])
    k = results.get("k", 5)
    ret_labels = _ret_labels(k)

    if records:
        dimension_total = sum(aggregate.get(d, 0.0) for d in JUDGE_DIMENSIONS)
        overall = round(dimension_total / len(JUDGE_DIMENSIONS) * 100)
        overall_grade = _grade(overall / 100)
    else:
        overall = 0
        overall_grade = "bad"

    # Headline cards: 4 generation dims + NDCG@k as the single best retrieval signal.
    card_parts = [_card(_GEN_LABELS[d], aggregate.get(d, 0.0)) for d in JUDGE_DIMENSIONS]
    card_parts.append(_card(ret_labels["ndcg_at_k"], aggregate.get("ndcg_at_k", 0.0)))
    cards = "".join(card_parts)

    gen_bars = "".join(
        [_bar(_GEN_LABELS[d], aggregate.get(d, 0.0)) for d in JUDGE_DIMENSIONS]
    )
    ret_bars = "".join(
        [_bar(ret_labels[m], aggregate.get(m, 0.0)) for m in RETRIEVAL_METRICS]
    )

    # Weakest cases first; only the eight lowest-scoring records are shown.
    weakest = sorted(records, key=_gen_mean)[:8]
    rows = []
    for record in weakest:
        scores = record["scores"]
        cells = "".join(_num_cell(scores[d]) for d in JUDGE_DIMENSIONS)
        if record.get("retrieval"):
            ndcg = record["retrieval"]["ndcg_at_k"]
        else:
            ndcg = None
        question_html = html.escape(record["question"])
        rationale_html = html.escape(record.get("rationale", ""))
        difficulty_html = html.escape(record.get("difficulty", ""))
        rows.append(
            "<tr><td class='q'>" + question_html
            + "<div class='why'>" + rationale_html + "</div></td>"
            + "<td><span class='tag'>" + difficulty_html + "</span></td>"
            + cells + _num_cell(ndcg) + "</tr>"
        )
    failing = "".join(rows)
    if not failing:
        failing = "<tr><td colspan='7'>No records.</td></tr>"

    return _TEMPLATE.format(
        overall=overall,
        overall_grade=overall_grade,
        n=results.get("n", 0),
        judge=html.escape(results.get("judge_fingerprint", "unknown")),
        created=html.escape(results.get("created", "")),
        ndcg_head=html.escape(ret_labels["ndcg_at_k"]),
        cards=cards,
        gen_bars=gen_bars,
        ret_bars=ret_bars,
        failing=failing,
    )
112
+
113
+
114
def write_html(results: dict, path: str) -> None:
    """Render ``results`` to HTML and write the page to ``path`` as UTF-8.

    The page is rendered before the file is opened, so a rendering error
    (for example a malformed record) raises without truncating an existing
    scorecard at ``path``.
    """
    page = render(results)
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
117
+
118
+
119
# Page template for the scorecard, filled with str.format. Single-brace fields
# ({judge}, {created}, {n}, {overall}, {overall_grade}, {cards}, {gen_bars},
# {ret_bars}, {ndcg_head}, {failing}) are placeholders; the doubled braces in
# the CSS are format escapes for literal braces.
_TEMPLATE = """<!doctype html>
<html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>RAG Eval Scorecard</title>
<style>
:root {{
--bg:#0b0e14; --panel:#141925; --line:#222b3a; --ink:#e6edf3; --mut:#8b98ad;
--good:#3fb950; --ok:#d29922; --bad:#f85149;
}}
* {{ box-sizing:border-box; }}
body {{ margin:0; background:var(--bg); color:var(--ink);
font:15px/1.5 -apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,Helvetica,Arial,sans-serif; }}
.wrap {{ max-width:980px; margin:0 auto; padding:40px 24px 64px; }}
header {{ display:flex; align-items:center; justify-content:space-between; flex-wrap:wrap; gap:16px;
border-bottom:1px solid var(--line); padding-bottom:24px; margin-bottom:28px; }}
h1 {{ margin:0; font-size:22px; letter-spacing:.2px; }}
h1 .kit {{ color:var(--mut); font-weight:500; }}
.meta {{ color:var(--mut); font-size:13px; text-align:right; }}
.meta code {{ color:var(--ink); background:var(--panel); padding:2px 6px; border-radius:5px; }}
.hero {{ display:flex; align-items:center; gap:24px; background:var(--panel);
border:1px solid var(--line); border-radius:14px; padding:24px 28px; margin-bottom:24px; }}
.ring {{ font-size:54px; font-weight:700; line-height:1; }}
.ring.good {{ color:var(--good); }} .ring.ok {{ color:var(--ok); }} .ring.bad {{ color:var(--bad); }}
.hero .sub {{ color:var(--mut); }}
.cards {{ display:grid; grid-template-columns:repeat(5,1fr); gap:12px; margin-bottom:24px; }}
.card {{ background:var(--panel); border:1px solid var(--line); border-radius:12px;
padding:16px; text-align:center; }}
.card-val {{ font-size:30px; font-weight:700; }}
.card-label {{ color:var(--mut); font-size:12px; margin-top:4px; }}
.card.good .card-val {{ color:var(--good); }} .card.ok .card-val {{ color:var(--ok); }}
.card.bad .card-val {{ color:var(--bad); }}
.grid2 {{ display:grid; grid-template-columns:1fr 1fr; gap:16px; margin-bottom:24px; }}
.panel {{ background:var(--panel); border:1px solid var(--line); border-radius:14px;
padding:24px 28px; }}
.panel.full {{ margin-bottom:24px; }}
h2 {{ font-size:14px; text-transform:uppercase; letter-spacing:.8px; color:var(--mut);
margin:0 0 18px; }}
h2 small {{ text-transform:none; letter-spacing:0; font-weight:400; }}
.metric {{ margin-bottom:14px; }}
.metric:last-child {{ margin-bottom:0; }}
.metric-head {{ display:flex; justify-content:space-between; font-size:13px; margin-bottom:5px; }}
.track {{ height:8px; background:#0b0e14; border-radius:6px; overflow:hidden; }}
.fill {{ height:100%; border-radius:6px; }}
.fill.good {{ background:var(--good); }} .fill.ok {{ background:var(--ok); }}
.fill.bad {{ background:var(--bad); }}
table {{ width:100%; border-collapse:collapse; font-size:13px; }}
th {{ text-align:right; color:var(--mut); font-weight:600; padding:8px 10px;
border-bottom:1px solid var(--line); }}
th:first-child {{ text-align:left; }}
td {{ padding:11px 10px; border-bottom:1px solid var(--line); vertical-align:top; }}
td.q {{ max-width:380px; }}
.why {{ color:var(--mut); font-size:12px; margin-top:4px; }}
td.num {{ text-align:right; font-variant-numeric:tabular-nums; font-weight:600; }}
td.num.good {{ color:var(--good); }} td.num.ok {{ color:var(--ok); }} td.num.bad {{ color:var(--bad); }}
td.num.mut {{ color:var(--mut); }}
.tag {{ background:#0b0e14; border:1px solid var(--line); color:var(--mut);
font-size:11px; padding:2px 7px; border-radius:999px; white-space:nowrap; }}
footer {{ color:var(--mut); font-size:12px; text-align:center; margin-top:32px; }}
footer a {{ color:var(--mut); }}
@media (max-width:720px) {{ .cards {{ grid-template-columns:repeat(2,1fr); }} .grid2 {{ grid-template-columns:1fr; }} }}
</style></head>
<body><div class="wrap">
<header>
<h1>RAG Eval Scorecard <span class="kit">· proofrag</span></h1>
<div class="meta">judge <code>{judge}</code><br>{created} · {n} cases</div>
</header>

<div class="hero">
<div class="ring {overall_grade}">{overall}</div>
<div>
<div style="font-size:18px;font-weight:600;">Overall generation quality</div>
<div class="sub">Mean of groundedness, correctness, completeness & citation quality across {n} cases.</div>
</div>
</div>

<div class="cards">{cards}</div>

<div class="grid2">
<div class="panel">
<h2>Generation <small>— LLM-as-judge</small></h2>
{gen_bars}
</div>
<div class="panel">
<h2>Retrieval <small>— rank-aware</small></h2>
{ret_bars}
</div>
</div>

<div class="panel full">
<h2>Weakest cases</h2>
<table>
<thead><tr>
<th>Question</th><th>Tier</th><th>Grnd</th><th>Corr</th><th>Comp</th><th>Cite</th><th>{ndcg_head}</th>
</tr></thead>
<tbody>{failing}</tbody>
</table>
</div>

<footer>Generated by <a href="https://github.com/unshDee/proofrag">proofrag</a> — point your agent at your docs, get a golden set + scorecard in one command.</footer>
</div></body></html>"""