proofrag 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proofrag/__init__.py +8 -0
- proofrag/cli.py +187 -0
- proofrag/corpus.py +59 -0
- proofrag/demo.py +143 -0
- proofrag/diffing.py +57 -0
- proofrag/embeddings.py +53 -0
- proofrag/goldenset.py +128 -0
- proofrag/judge.py +142 -0
- proofrag/llm.py +117 -0
- proofrag/metrics.py +106 -0
- proofrag/scorecard.py +218 -0
- proofrag-0.3.0.dist-info/METADATA +183 -0
- proofrag-0.3.0.dist-info/RECORD +16 -0
- proofrag-0.3.0.dist-info/WHEEL +4 -0
- proofrag-0.3.0.dist-info/entry_points.txt +2 -0
- proofrag-0.3.0.dist-info/licenses/LICENSE +21 -0
proofrag/judge.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""LLM-as-judge scoring + rank-aware retrieval metrics → results.json.
|
|
2
|
+
|
|
3
|
+
Generation quality is scored by a pinned judge model on four dimensions
|
|
4
|
+
(groundedness, correctness, completeness, citation_quality). Retrieval quality is
|
|
5
|
+
scored separately (Recall@k, Precision@k, NDCG@k, MRR), so a retriever miss is
|
|
6
|
+
never blamed on the generator. The judge fingerprint is recorded so two scorecards
|
|
7
|
+
are only compared when they used the same judge.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import datetime as _dt
|
|
13
|
+
import json
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .llm import LLM
|
|
17
|
+
from .metrics import RETRIEVAL_METRICS, Matcher, lexical_matcher, retrieval_metrics
|
|
18
|
+
|
|
19
|
+
JUDGE_DIMENSIONS = ["groundedness", "correctness", "completeness", "citation_quality"]
|
|
20
|
+
|
|
21
|
+
JUDGE_SYS = (
|
|
22
|
+
"You are a strict, consistent evaluator of RAG answers. "
|
|
23
|
+
"Score each dimension from 0.0 to 1.0. Be calibrated: 1.0 means flawless, "
|
|
24
|
+
"0.5 means partially right, 0.0 means absent or wrong. Output JSON only."
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
JUDGE_TMPL = '''Question: {q}
|
|
28
|
+
|
|
29
|
+
Reference (gold) answer: {gold}
|
|
30
|
+
|
|
31
|
+
Context the system retrieved:
|
|
32
|
+
"""{ctx}"""
|
|
33
|
+
|
|
34
|
+
System's answer: {ans}
|
|
35
|
+
|
|
36
|
+
Score 0.0-1.0:
|
|
37
|
+
- groundedness: is the answer supported by the retrieved context (no hallucination)?
|
|
38
|
+
- correctness: do its facts match the reference answer?
|
|
39
|
+
- completeness: does it cover what the reference covers?
|
|
40
|
+
- citation_quality: are claims attributable to the retrieved context?
|
|
41
|
+
|
|
42
|
+
Return JSON:
|
|
43
|
+
{{"groundedness": 0.0, "correctness": 0.0, "completeness": 0.0, "citation_quality": 0.0, "rationale": "one short sentence"}}'''
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def evaluate(
    goldenset: list[dict],
    predictions: list[dict],
    llm: LLM | None = None,
    k: int = 5,
    matcher: Matcher | None = None,
) -> dict:
    """Score every golden case that has a prediction and summarise the run.

    Predictions are indexed by their ``"id"``; golden cases without a
    prediction of the same id are skipped and do not count towards ``n``.
    Each remaining case is judged by ``llm`` and, when it carries
    ``gold_contexts``, also scored with the retrieval metrics at cutoff ``k``.

    Args:
        goldenset: Golden cases; each needs ``"id"`` and ``"question"``.
        predictions: System outputs; each needs ``"id"`` and may carry
            ``"answer"`` and ``"retrieved_contexts"``.
        llm: Judge backend. A default ``LLM()`` is created when falsy.
        k: Cutoff passed to the retrieval metrics.
        matcher: Relevance predicate for retrieved chunks. Falls back to
            ``lexical_matcher()`` when falsy.

    Returns:
        A dict with the judge fingerprint, creation timestamp (UTC, second
        resolution), ``k``, the number of scored records, the aggregate
        scores and the per-case records.
    """
    if not llm:
        llm = LLM()
    if not matcher:
        matcher = lexical_matcher()

    by_id = {}
    for prediction in predictions:
        by_id[prediction["id"]] = prediction

    records: list[dict] = []
    for case in goldenset:
        prediction = by_id.get(case["id"])
        if prediction is None:
            continue
        retrieved = prediction.get("retrieved_contexts", []) or []
        answer = prediction.get("answer", "")
        gold_contexts = case.get("gold_contexts", []) or []

        scores = _judge_one(llm, case, answer, retrieved)
        # Unanswerable cases carry no gold context, so there is nothing to
        # score on the retrieval side.
        if gold_contexts:
            retrieval = retrieval_metrics(gold_contexts, retrieved, k, matcher)
        else:
            retrieval = None

        # The rationale is stored next to the scores, not inside them, so the
        # "scores" dict only holds the numeric judge dimensions.
        rationale = scores.pop("rationale", "")
        records.append(
            {
                "id": case["id"],
                "question": case["question"],
                "difficulty": case.get("difficulty", "single_doc"),
                "answer": answer,
                "scores": scores,
                "retrieval": retrieval,
                "rationale": rationale,
            }
        )

    created = _dt.datetime.now(_dt.timezone.utc).isoformat(timespec="seconds")
    return {
        "judge_fingerprint": llm.fingerprint,
        "created": created,
        "k": k,
        "n": len(records),
        "aggregate": _aggregate(records),
        "records": records,
    }
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _judge_one(llm: LLM, gold: dict, answer: str, retrieved: list[str]) -> dict:
    """Ask the judge to score one answer and return clamped dimension scores.

    The retrieved chunks are joined with a ``---`` separator and cut to the
    first 4000 characters before being placed in the prompt. If the judge call
    raises, every dimension is scored 0.0 and the error text is stored as the
    rationale so the run can continue.

    Returns:
        A dict with one float per entry of ``JUDGE_DIMENSIONS`` plus a
        ``"rationale"`` string (at most 300 characters on the success path).
    """
    if retrieved:
        context = "\n\n---\n\n".join(retrieved)
    else:
        context = "(no context retrieved)"

    fields = {
        "q": gold["question"],
        "gold": gold.get("gold_answer", ""),
        "ctx": context[:4000],
        "ans": answer or "(no answer)",
    }
    prompt = JUDGE_TMPL.format(**fields)

    try:
        raw = llm.complete_json(JUDGE_SYS, prompt)
    except Exception as e:  # noqa: BLE001 - record the failure, keep going
        failed: dict[str, Any] = dict.fromkeys(JUDGE_DIMENSIONS, 0.0)
        failed["rationale"] = f"judge error: {e}"
        return failed

    verdict: dict[str, Any] = {}
    for dimension in JUDGE_DIMENSIONS:
        verdict[dimension] = _clamp(raw.get(dimension, 0.0))
    verdict["rationale"] = str(raw.get("rationale", ""))[:300]
    return verdict
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _clamp(v) -> float:
|
|
117
|
+
try:
|
|
118
|
+
return round(max(0.0, min(1.0, float(v))), 3)
|
|
119
|
+
except (TypeError, ValueError):
|
|
120
|
+
return 0.0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _mean(values: list[float]) -> float:
|
|
124
|
+
return round(sum(values) / len(values), 3) if values else 0.0
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _aggregate(records: list[dict]) -> dict:
    """Average each judge dimension and each retrieval metric over a run.

    Judge dimensions are averaged over all records. Retrieval metrics are
    averaged only over records whose ``"retrieval"`` entry is truthy, so cases
    without retrieval scores do not drag the means down.
    """
    summary = {}
    for dimension in JUDGE_DIMENSIONS:
        summary[dimension] = _mean([rec["scores"][dimension] for rec in records])

    with_retrieval = []
    for rec in records:
        metrics = rec.get("retrieval")
        if metrics:
            with_retrieval.append(metrics)

    for name in RETRIEVAL_METRICS:
        summary[name] = _mean([metrics[name] for metrics in with_retrieval])
    return summary
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def write_results(results: dict, path: str) -> None:
    """Write ``results`` to ``path`` as indented UTF-8 JSON.

    The payload is serialized before the file is opened. Opening in ``"w"``
    mode truncates the target, so serializing first means a value that JSON
    cannot encode raises without destroying an existing results file.

    Raises:
        TypeError: If ``results`` contains a value JSON cannot encode.
        OSError: If ``path`` cannot be opened for writing.
    """
    payload = json.dumps(results, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def read_results(path: str) -> dict:
    """Load a results file written as UTF-8 JSON and return the parsed value."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
|
proofrag/llm.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Provider-agnostic LLM client.
|
|
2
|
+
|
|
3
|
+
Auto-detects Anthropic or OpenAI-compatible (incl. local/Ollama via OPENAI_BASE_URL).
|
|
4
|
+
Defaults to a cheap model so generating a golden set + judging doesn't cost much.
|
|
5
|
+
The judge model is pinned and surfaced as a `fingerprint` so scores stay comparable.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
# Cheap-by-default. Override with PROOFRAG_MODEL.
|
|
16
|
+
DEFAULT_ANTHROPIC_MODEL = "claude-haiku-4-5-20251001"
|
|
17
|
+
DEFAULT_OPENAI_MODEL = "gpt-4o-mini"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LLMError(RuntimeError):
    """Signals that the LLM backend is misconfigured or cannot be used."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LLM:
    """Thin wrapper over Anthropic / OpenAI-compatible chat completions.

    Provider and model are resolved once at construction time: explicit
    argument first, then the ``PROOFRAG_PROVIDER`` / ``PROOFRAG_MODEL``
    environment variables, then auto-detection / the per-provider default.
    The backend SDK client is created lazily on the first completion call.

    Both backends request ``temperature=0`` so repeated judge runs against
    the same model are as repeatable as the provider allows.
    """

    def __init__(self, provider: str | None = None, model: str | None = None):
        """Resolve the provider and model; no network call or SDK import here.

        Raises:
            LLMError: If no provider is given or configured and no API key is
                found in the environment.
        """
        self.provider = provider or os.environ.get("PROOFRAG_PROVIDER") or self._autodetect()
        self.model = model or os.environ.get("PROOFRAG_MODEL") or self._default_model()
        self._client: Any = None  # one of several backend SDK clients, set lazily

    @staticmethod
    def _autodetect() -> str:
        """Pick a provider from the API keys present in the environment.

        Anthropic wins when both keys are set.
        """
        if os.environ.get("ANTHROPIC_API_KEY"):
            return "anthropic"
        if os.environ.get("OPENAI_API_KEY"):
            return "openai"
        raise LLMError(
            "No LLM credentials found. Set ANTHROPIC_API_KEY or OPENAI_API_KEY "
            "(or run `proofrag demo` to see a scorecard with no API key)."
        )

    def _default_model(self) -> str:
        """Return the default model; any non-"anthropic" provider gets the OpenAI one."""
        return DEFAULT_ANTHROPIC_MODEL if self.provider == "anthropic" else DEFAULT_OPENAI_MODEL

    @property
    def fingerprint(self) -> str:
        """Stable id of the judge backend, recorded in every scorecard."""
        return f"{self.provider}:{self.model}"

    def complete_json(self, system: str, prompt: str) -> dict[str, Any]:
        """Complete and parse the first JSON object out of the response."""
        return _extract_json(self._complete(system, prompt))

    # -- backends ---------------------------------------------------------

    def _complete(self, system: str, prompt: str) -> str:
        """Dispatch to the backend for ``self.provider`` and return its text.

        Raises:
            LLMError: If the provider is neither "anthropic" nor "openai".
        """
        if self.provider == "anthropic":
            return self._anthropic(system, prompt)
        if self.provider == "openai":
            return self._openai(system, prompt)
        raise LLMError(f"Unknown provider: {self.provider!r}")

    def _anthropic(self, system: str, prompt: str) -> str:
        """Run one completion through the Anthropic SDK and return its text blocks joined."""
        try:
            import anthropic
        except ImportError as e:
            raise LLMError("Anthropic backend needs: pip install 'proofrag[anthropic]'") from e
        if self._client is None:
            self._client = anthropic.Anthropic()
        msg = self._client.messages.create(
            model=self.model,
            max_tokens=2048,
            # Match the OpenAI backend: the judge should be as deterministic as
            # the provider allows so scorecards stay comparable.
            temperature=0,
            system=system,
            messages=[{"role": "user", "content": prompt}],
        )
        # Only text blocks carry the answer; other block types are skipped.
        return "".join(
            getattr(b, "text", "") for b in msg.content if getattr(b, "type", "") == "text"
        )

    def _openai(self, system: str, prompt: str) -> str:
        """Run one completion through the OpenAI SDK and return the first choice's text.

        ``OPENAI_BASE_URL`` is passed as ``base_url`` when set.
        """
        try:
            import openai
        except ImportError as e:
            raise LLMError("OpenAI backend needs: pip install 'proofrag[openai]'") from e
        if self._client is None:
            base = os.environ.get("OPENAI_BASE_URL")
            self._client = openai.OpenAI(base_url=base) if base else openai.OpenAI()
        resp = self._client.chat.completions.create(
            model=self.model,
            temperature=0,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": prompt},
            ],
        )
        # The SDK may return None for content; normalise to an empty string.
        return resp.choices[0].message.content or ""
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _extract_json(text: str) -> dict:
|
|
101
|
+
"""Pull the first JSON object out of a model response (handles code fences)."""
|
|
102
|
+
text = text.strip()
|
|
103
|
+
fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
104
|
+
if fence:
|
|
105
|
+
text = fence.group(1)
|
|
106
|
+
start = text.find("{")
|
|
107
|
+
if start == -1:
|
|
108
|
+
raise LLMError(f"No JSON object in response: {text[:200]!r}")
|
|
109
|
+
depth = 0
|
|
110
|
+
for i in range(start, len(text)):
|
|
111
|
+
if text[i] == "{":
|
|
112
|
+
depth += 1
|
|
113
|
+
elif text[i] == "}":
|
|
114
|
+
depth -= 1
|
|
115
|
+
if depth == 0:
|
|
116
|
+
return json.loads(text[start : i + 1])
|
|
117
|
+
raise LLMError(f"Unbalanced JSON in response: {text[:200]!r}")
|
proofrag/metrics.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Rank-aware retrieval metrics (Recall@k, Precision@k, NDCG@k, MRR).
|
|
2
|
+
|
|
3
|
+
These separate retriever failures from generator failures: if NDCG@k is low, the
|
|
4
|
+
right context never reached the model in a usable rank, so a bad answer isn't the
|
|
5
|
+
LLM's fault.
|
|
6
|
+
|
|
7
|
+
Relevance is decided by a pluggable *matcher* `(gold_context, retrieved_chunk) ->
|
|
8
|
+
bool`. The default is token-overlap (Jaccard) — dependency-free, runs anywhere.
|
|
9
|
+
Swap in `embedding_matcher()` (see embeddings.py) for semantic matching without
|
|
10
|
+
touching the metric code.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
import re
|
|
17
|
+
from collections.abc import Callable
|
|
18
|
+
|
|
19
|
+
_WORD = re.compile(r"[a-z0-9]+")
|
|
20
|
+
|
|
21
|
+
Matcher = Callable[[str, str], bool]
|
|
22
|
+
RETRIEVAL_METRICS = ["recall_at_k", "precision_at_k", "ndcg_at_k", "mrr"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _tokens(text: str) -> set[str]:
    """Return the distinct ``_WORD`` matches found in the lower-cased text."""
    lowered = text.lower()
    return {match.group() for match in _WORD.finditer(lowered)}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _jaccard(a: str, b: str) -> float:
    """Return the Jaccard similarity of the two texts' token sets.

    The result is 0.0 when either text yields no tokens.
    """
    left = _tokens(a)
    right = _tokens(b)
    if not (left and right):
        return 0.0
    shared = left.intersection(right)
    combined = left.union(right)
    return len(shared) / len(combined)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def lexical_matcher(threshold: float = 0.4) -> Matcher:
    """Build the default matcher: relevant when Jaccard overlap >= threshold."""

    def _matches(gold, chunk):
        return _jaccard(gold, chunk) >= threshold

    return _matches
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _relevance(gold_contexts: list[str], chunk: str, matcher: Matcher) -> bool:
|
|
42
|
+
return any(matcher(g, chunk) for g in gold_contexts)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def recall_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Return the share of gold contexts matched by any of the first k chunks.

    With no gold contexts there is nothing to retrieve (e.g. an unanswerable
    case), so the recall is 1.0.
    """
    if not gold_contexts:
        return 1.0
    window = retrieved[:k]
    found = 0
    for gold in gold_contexts:
        # Each gold context counts at most once, however many chunks match it.
        for chunk in window:
            if matcher(gold, chunk):
                found += 1
                break
    return found / len(gold_contexts)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def precision_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Return the share of the first k retrieved chunks that are relevant.

    The denominator is the number of chunks actually present in the top-k
    window, not k itself; an empty window scores 0.0.
    """
    window = retrieved[:k]
    if not window:
        return 0.0
    relevant = 0
    for chunk in window:
        if _relevance(gold_contexts, chunk, matcher):
            relevant += 1
    return relevant / len(window)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def ndcg_at_k(gold_contexts, retrieved, k, matcher) -> float:
    """Return NDCG@k with binary gains over the first k retrieved chunks.

    The ideal ordering is the same list of gains sorted best-first, and the
    result is 0.0 when no chunk in the window is relevant.

    NOTE(review): because the ideal DCG is built from the retrieved gains
    only, gold contexts that were never retrieved do not lower this score;
    confirm that is the intended definition.
    """
    gains = []
    for chunk in retrieved[:k]:
        gains.append(1.0 if _relevance(gold_contexts, chunk, matcher) else 0.0)

    def _dcg(values):
        # Position 0 is discounted by log2(2) == 1, position 1 by log2(3), ...
        return sum(gain / math.log2(pos + 2) for pos, gain in enumerate(values))

    actual = _dcg(gains)
    ideal = _dcg(sorted(gains, reverse=True))
    if ideal > 0:
        return actual / ideal
    return 0.0
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def mrr(gold_contexts, retrieved, matcher) -> float:
    """Return 1/rank of the first relevant chunk, or 0.0 when none is relevant.

    The whole retrieved list is scanned; no top-k cutoff is applied here.
    """
    relevant_ranks = (
        rank
        for rank, chunk in enumerate(retrieved, start=1)
        if _relevance(gold_contexts, chunk, matcher)
    )
    first = next(relevant_ranks, None)
    if first is None:
        return 0.0
    return 1.0 / first
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def retrieval_metrics(
    gold_contexts: list[str],
    retrieved: list[str],
    k: int = 5,
    matcher: Matcher | None = None,
) -> dict:
    """Compute every retrieval metric for one (gold_contexts, retrieved) pair.

    Each value is rounded to 3 decimals. ``matcher`` falls back to
    ``lexical_matcher()`` when falsy.
    """
    if not matcher:
        matcher = lexical_matcher()
    recall = recall_at_k(gold_contexts, retrieved, k, matcher)
    precision = precision_at_k(gold_contexts, retrieved, k, matcher)
    ndcg = ndcg_at_k(gold_contexts, retrieved, k, matcher)
    reciprocal_rank = mrr(gold_contexts, retrieved, matcher)
    return {
        "recall_at_k": round(recall, 3),
        "precision_at_k": round(precision, 3),
        "ndcg_at_k": round(ndcg, 3),
        "mrr": round(reciprocal_rank, 3),
    }
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# --- back-compat -------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def context_matches(gold: str, retrieved: list[str], threshold: float = 0.4) -> bool:
    """Return True when any retrieved context lexically matches the gold one.

    Matching uses ``lexical_matcher(threshold)``.
    """
    matches = lexical_matcher(threshold)
    for chunk in retrieved:
        if matches(gold, chunk):
            return True
    return False
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def retrieval_recall(gold_contexts, retrieved, threshold: float = 0.4) -> float:
    """Recall over the whole retrieved list with the lexical matcher.

    Kept for back-compat. The cutoff is the length of ``retrieved`` (or 1 when
    it is empty), so every retrieved chunk is considered.
    """
    cutoff = len(retrieved) or 1
    matcher = lexical_matcher(threshold)
    return recall_at_k(gold_contexts, retrieved, cutoff, matcher)
|
proofrag/scorecard.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Render a results dict into a self-contained, shareable HTML scorecard.
|
|
2
|
+
|
|
3
|
+
Zero external assets — inline CSS, no JS, no fonts fetched. Open the file
|
|
4
|
+
anywhere, attach it to a PR, drop it in CI artifacts. This is the artifact
|
|
5
|
+
people screenshot, so it is built to look good.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import html
|
|
11
|
+
|
|
12
|
+
from .judge import JUDGE_DIMENSIONS
|
|
13
|
+
from .metrics import RETRIEVAL_METRICS
|
|
14
|
+
|
|
15
|
+
_GEN_LABELS = {
|
|
16
|
+
"groundedness": "Groundedness",
|
|
17
|
+
"correctness": "Correctness",
|
|
18
|
+
"completeness": "Completeness",
|
|
19
|
+
"citation_quality": "Citation Quality",
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _ret_labels(k: int) -> dict:
|
|
24
|
+
return {
|
|
25
|
+
"recall_at_k": f"Recall@{k}",
|
|
26
|
+
"precision_at_k": f"Precision@{k}",
|
|
27
|
+
"ndcg_at_k": f"NDCG@{k}",
|
|
28
|
+
"mrr": "MRR",
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _grade(v: float) -> str:
|
|
33
|
+
if v >= 0.85:
|
|
34
|
+
return "good"
|
|
35
|
+
if v >= 0.65:
|
|
36
|
+
return "ok"
|
|
37
|
+
return "bad"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _bar(label: str, value: float) -> str:
    """Render one labelled progress bar for a 0-1 score.

    The label is HTML-escaped; the score is shown as a whole-number percent
    and also drives the bar width and grade colour class.

    NOTE(review): the inner lines of the markup are emitted without leading
    whitespace, as they appear in the reviewed source view; confirm against
    the original literal if exact bytes matter.
    """
    percent = round(value * 100)
    caption = html.escape(label)
    grade = _grade(value)
    lines = [
        "",
        '<div class="metric">',
        f'<div class="metric-head"><span>{caption}</span><b>{percent}</b></div>',
        f'<div class="track"><div class="fill {grade}" style="width:{percent}%"></div></div>',
        "</div>",
    ]
    return "\n".join(lines)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _card(label: str, value: float) -> str:
    """Render one headline card showing a 0-1 score as a whole-number percent.

    The label is HTML-escaped and the card carries the score's grade class.

    NOTE(review): the inner lines of the markup are emitted without leading
    whitespace, as they appear in the reviewed source view; confirm against
    the original literal if exact bytes matter.
    """
    percent = round(value * 100)
    grade = _grade(value)
    caption = html.escape(label)
    lines = [
        "",
        f'<div class="card {grade}">',
        f'<div class="card-val">{percent}</div>',
        f'<div class="card-label">{caption}</div>',
        "</div>",
    ]
    return "\n".join(lines)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _gen_mean(r: dict) -> float:
    """Return the unrounded mean of a record's judge-dimension scores."""
    per_dimension = r["scores"]
    total = sum(per_dimension[name] for name in JUDGE_DIMENSIONS)
    return total / len(JUDGE_DIMENSIONS)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _num_cell(value) -> str:
    """Render a numeric table cell; ``None`` becomes a muted dash."""
    if value is not None:
        css = _grade(value)
        shown = round(value * 100)
        return f'<td class="num {css}">{shown}</td>'
    return '<td class="num mut">—</td>'
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def render(results: dict) -> str:
    """Build the full HTML scorecard for a results dict.

    The page shows an overall generation score (mean of the aggregated judge
    dimensions, 0 when there are no records), one card per judge dimension
    plus NDCG@k, bar charts for generation and retrieval, and a table of the
    eight records with the lowest mean judge score.
    """
    aggregate = results.get("aggregate", {})
    cases = results.get("records", [])
    labels = _ret_labels(results.get("k", 5))

    def _agg(name):
        # Missing aggregates are shown as zero.
        return aggregate.get(name, 0.0)

    if cases:
        dimension_total = sum(_agg(d) for d in JUDGE_DIMENSIONS)
        overall = round(dimension_total / len(JUDGE_DIMENSIONS) * 100)
    else:
        overall = 0

    # Headline cards: 4 generation dims + NDCG@k as the single best retrieval signal.
    card_parts = [_card(_GEN_LABELS[d], _agg(d)) for d in JUDGE_DIMENSIONS]
    card_parts.append(_card(labels["ndcg_at_k"], _agg("ndcg_at_k")))
    cards = "".join(card_parts)

    gen_bars = "".join([_bar(_GEN_LABELS[d], _agg(d)) for d in JUDGE_DIMENSIONS])
    ret_bars = "".join([_bar(labels[m], _agg(m)) for m in RETRIEVAL_METRICS])

    table_rows = []
    weakest = sorted(cases, key=_gen_mean)[:8]
    for rec in weakest:
        scores = rec["scores"]
        score_cells = "".join([_num_cell(scores[d]) for d in JUDGE_DIMENSIONS])
        retrieval = rec.get("retrieval")
        ndcg = retrieval["ndcg_at_k"] if retrieval else None
        question = html.escape(rec["question"])
        why = html.escape(rec.get("rationale", ""))
        tier = html.escape(rec.get("difficulty", ""))
        ndcg_cell = _num_cell(ndcg)
        table_rows.append(
            f"<tr><td class='q'>{question}"
            f"<div class='why'>{why}</div></td>"
            f"<td><span class='tag'>{tier}</span></td>"
            f"{score_cells}{ndcg_cell}</tr>"
        )
    failing = "".join(table_rows) or "<tr><td colspan='7'>No records.</td></tr>"

    fields = {
        "overall": overall,
        "overall_grade": _grade(overall / 100) if cases else "bad",
        "n": results.get("n", 0),
        "judge": html.escape(results.get("judge_fingerprint", "unknown")),
        "created": html.escape(results.get("created", "")),
        "ndcg_head": html.escape(labels["ndcg_at_k"]),
        "cards": cards,
        "gen_bars": gen_bars,
        "ret_bars": ret_bars,
        "failing": failing,
    }
    return _TEMPLATE.format(**fields)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def write_html(results: dict, path: str) -> None:
    """Render ``results`` and write the scorecard to ``path`` as UTF-8 HTML.

    The page is rendered before the file is opened. Opening in ``"w"`` mode
    truncates the target, so rendering first means a malformed results dict
    raises without wiping a scorecard that already exists at ``path``.
    """
    document = render(results)
    with open(path, "w", encoding="utf-8") as f:
        f.write(document)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
_TEMPLATE = """<!doctype html>
|
|
120
|
+
<html lang="en"><head><meta charset="utf-8">
|
|
121
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
122
|
+
<title>RAG Eval Scorecard</title>
|
|
123
|
+
<style>
|
|
124
|
+
:root {{
|
|
125
|
+
--bg:#0b0e14; --panel:#141925; --line:#222b3a; --ink:#e6edf3; --mut:#8b98ad;
|
|
126
|
+
--good:#3fb950; --ok:#d29922; --bad:#f85149;
|
|
127
|
+
}}
|
|
128
|
+
* {{ box-sizing:border-box; }}
|
|
129
|
+
body {{ margin:0; background:var(--bg); color:var(--ink);
|
|
130
|
+
font:15px/1.5 -apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,Helvetica,Arial,sans-serif; }}
|
|
131
|
+
.wrap {{ max-width:980px; margin:0 auto; padding:40px 24px 64px; }}
|
|
132
|
+
header {{ display:flex; align-items:center; justify-content:space-between; flex-wrap:wrap; gap:16px;
|
|
133
|
+
border-bottom:1px solid var(--line); padding-bottom:24px; margin-bottom:28px; }}
|
|
134
|
+
h1 {{ margin:0; font-size:22px; letter-spacing:.2px; }}
|
|
135
|
+
h1 .kit {{ color:var(--mut); font-weight:500; }}
|
|
136
|
+
.meta {{ color:var(--mut); font-size:13px; text-align:right; }}
|
|
137
|
+
.meta code {{ color:var(--ink); background:var(--panel); padding:2px 6px; border-radius:5px; }}
|
|
138
|
+
.hero {{ display:flex; align-items:center; gap:24px; background:var(--panel);
|
|
139
|
+
border:1px solid var(--line); border-radius:14px; padding:24px 28px; margin-bottom:24px; }}
|
|
140
|
+
.ring {{ font-size:54px; font-weight:700; line-height:1; }}
|
|
141
|
+
.ring.good {{ color:var(--good); }} .ring.ok {{ color:var(--ok); }} .ring.bad {{ color:var(--bad); }}
|
|
142
|
+
.hero .sub {{ color:var(--mut); }}
|
|
143
|
+
.cards {{ display:grid; grid-template-columns:repeat(5,1fr); gap:12px; margin-bottom:24px; }}
|
|
144
|
+
.card {{ background:var(--panel); border:1px solid var(--line); border-radius:12px;
|
|
145
|
+
padding:16px; text-align:center; }}
|
|
146
|
+
.card-val {{ font-size:30px; font-weight:700; }}
|
|
147
|
+
.card-label {{ color:var(--mut); font-size:12px; margin-top:4px; }}
|
|
148
|
+
.card.good .card-val {{ color:var(--good); }} .card.ok .card-val {{ color:var(--ok); }}
|
|
149
|
+
.card.bad .card-val {{ color:var(--bad); }}
|
|
150
|
+
.grid2 {{ display:grid; grid-template-columns:1fr 1fr; gap:16px; margin-bottom:24px; }}
|
|
151
|
+
.panel {{ background:var(--panel); border:1px solid var(--line); border-radius:14px;
|
|
152
|
+
padding:24px 28px; }}
|
|
153
|
+
.panel.full {{ margin-bottom:24px; }}
|
|
154
|
+
h2 {{ font-size:14px; text-transform:uppercase; letter-spacing:.8px; color:var(--mut);
|
|
155
|
+
margin:0 0 18px; }}
|
|
156
|
+
h2 small {{ text-transform:none; letter-spacing:0; font-weight:400; }}
|
|
157
|
+
.metric {{ margin-bottom:14px; }}
|
|
158
|
+
.metric:last-child {{ margin-bottom:0; }}
|
|
159
|
+
.metric-head {{ display:flex; justify-content:space-between; font-size:13px; margin-bottom:5px; }}
|
|
160
|
+
.track {{ height:8px; background:#0b0e14; border-radius:6px; overflow:hidden; }}
|
|
161
|
+
.fill {{ height:100%; border-radius:6px; }}
|
|
162
|
+
.fill.good {{ background:var(--good); }} .fill.ok {{ background:var(--ok); }}
|
|
163
|
+
.fill.bad {{ background:var(--bad); }}
|
|
164
|
+
table {{ width:100%; border-collapse:collapse; font-size:13px; }}
|
|
165
|
+
th {{ text-align:right; color:var(--mut); font-weight:600; padding:8px 10px;
|
|
166
|
+
border-bottom:1px solid var(--line); }}
|
|
167
|
+
th:first-child {{ text-align:left; }}
|
|
168
|
+
td {{ padding:11px 10px; border-bottom:1px solid var(--line); vertical-align:top; }}
|
|
169
|
+
td.q {{ max-width:380px; }}
|
|
170
|
+
.why {{ color:var(--mut); font-size:12px; margin-top:4px; }}
|
|
171
|
+
td.num {{ text-align:right; font-variant-numeric:tabular-nums; font-weight:600; }}
|
|
172
|
+
td.num.good {{ color:var(--good); }} td.num.ok {{ color:var(--ok); }} td.num.bad {{ color:var(--bad); }}
|
|
173
|
+
td.num.mut {{ color:var(--mut); }}
|
|
174
|
+
.tag {{ background:#0b0e14; border:1px solid var(--line); color:var(--mut);
|
|
175
|
+
font-size:11px; padding:2px 7px; border-radius:999px; white-space:nowrap; }}
|
|
176
|
+
footer {{ color:var(--mut); font-size:12px; text-align:center; margin-top:32px; }}
|
|
177
|
+
footer a {{ color:var(--mut); }}
|
|
178
|
+
@media (max-width:720px) {{ .cards {{ grid-template-columns:repeat(2,1fr); }} .grid2 {{ grid-template-columns:1fr; }} }}
|
|
179
|
+
</style></head>
|
|
180
|
+
<body><div class="wrap">
|
|
181
|
+
<header>
|
|
182
|
+
<h1>RAG Eval Scorecard <span class="kit">· proofrag</span></h1>
|
|
183
|
+
<div class="meta">judge <code>{judge}</code><br>{created} · {n} cases</div>
|
|
184
|
+
</header>
|
|
185
|
+
|
|
186
|
+
<div class="hero">
|
|
187
|
+
<div class="ring {overall_grade}">{overall}</div>
|
|
188
|
+
<div>
|
|
189
|
+
<div style="font-size:18px;font-weight:600;">Overall generation quality</div>
|
|
190
|
+
<div class="sub">Mean of groundedness, correctness, completeness & citation quality across {n} cases.</div>
|
|
191
|
+
</div>
|
|
192
|
+
</div>
|
|
193
|
+
|
|
194
|
+
<div class="cards">{cards}</div>
|
|
195
|
+
|
|
196
|
+
<div class="grid2">
|
|
197
|
+
<div class="panel">
|
|
198
|
+
<h2>Generation <small>— LLM-as-judge</small></h2>
|
|
199
|
+
{gen_bars}
|
|
200
|
+
</div>
|
|
201
|
+
<div class="panel">
|
|
202
|
+
<h2>Retrieval <small>— rank-aware</small></h2>
|
|
203
|
+
{ret_bars}
|
|
204
|
+
</div>
|
|
205
|
+
</div>
|
|
206
|
+
|
|
207
|
+
<div class="panel full">
|
|
208
|
+
<h2>Weakest cases</h2>
|
|
209
|
+
<table>
|
|
210
|
+
<thead><tr>
|
|
211
|
+
<th>Question</th><th>Tier</th><th>Grnd</th><th>Corr</th><th>Comp</th><th>Cite</th><th>{ndcg_head}</th>
|
|
212
|
+
</tr></thead>
|
|
213
|
+
<tbody>{failing}</tbody>
|
|
214
|
+
</table>
|
|
215
|
+
</div>
|
|
216
|
+
|
|
217
|
+
<footer>Generated by <a href="https://github.com/unshDee/proofrag">proofrag</a> — point your agent at your docs, get a golden set + scorecard in one command.</footer>
|
|
218
|
+
</div></body></html>"""
|