PyPI - proofrag - Versions diffs - 0.3.0__py3-none-any.whl - Mend

proofrag 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

proofrag/__init__.py +8 -0
proofrag/cli.py +187 -0
proofrag/corpus.py +59 -0
proofrag/demo.py +143 -0
proofrag/diffing.py +57 -0
proofrag/embeddings.py +53 -0
proofrag/goldenset.py +128 -0
proofrag/judge.py +142 -0
proofrag/llm.py +117 -0
proofrag/metrics.py +106 -0
proofrag/scorecard.py +218 -0
proofrag-0.3.0.dist-info/METADATA +183 -0
proofrag-0.3.0.dist-info/RECORD +16 -0
proofrag-0.3.0.dist-info/WHEEL +4 -0
proofrag-0.3.0.dist-info/entry_points.txt +2 -0
proofrag-0.3.0.dist-info/licenses/LICENSE +21 -0

proofrag/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""proofrag: zero-config RAG/LLM evaluation — golden sets, LLM-as-judge, scorecards."""
+from importlib.metadata import PackageNotFoundError, version
+# Resolve the package version from installed distribution metadata; use a
+# placeholder string when no metadata for "proofrag" can be found.
+try:
+    __version__ = version("proofrag")
+except PackageNotFoundError:  # running from a source tree without install metadata
+    __version__ = "0+unknown"

proofrag/cli.py ADDED Viewed

@@ -0,0 +1,187 @@
+"""proofrag command-line interface.
+proofrag generate --corpus DIR     # docs  -> goldenset.jsonl
+proofrag evaluate --goldenset ...  # +preds -> results.json  (+ optional CI gate)
+proofrag report   --results ...    # results -> scorecard.html
+proofrag diff      --baseline ...  # compare vs a baseline; fail on regression
+proofrag demo                      # canned scorecard, no API key
+"""
+from __future__ import annotations
+import argparse
+import sys
+from . import __version__
+from .judge import JUDGE_DIMENSIONS
+def _eprint(*parts):
+    """Print the given values to stderr, space-separated, with a trailing newline."""
+    stream = sys.stderr  # looked up per call so a redirected stderr is honoured
+    print(*parts, file=stream)
+def cmd_generate(args) -> int:
+    """Synthesize a golden set from ``args.corpus`` and write it to ``args.out``.
+    Returns 0 on success and 2 on any failure: the corpus path is missing or
+    yields no chunks, the LLM layer raises LLMError, or no case could be
+    generated at all. Progress and errors go to stderr.
+    """
+    from .corpus import load_corpus
+    from .goldenset import generate, write_jsonl
+    from .llm import LLM, LLMError
+    try:
+        chunks = load_corpus(args.corpus, max_chars=args.chunk_chars)
+    except (FileNotFoundError, ValueError) as e:
+        # load_corpus raises these for a missing path / a corpus with no text;
+        # report them like every other CLI error instead of dumping a traceback.
+        _eprint(f"error: {e}")
+        return 2
+    _eprint(f"Loaded {len(chunks)} chunks from {args.corpus}")
+    try:
+        records = generate(chunks, n=args.n, seed=args.seed, llm=LLM(model=args.model))
+    except LLMError as e:
+        _eprint(f"error: {e}")
+        return 2
+    if not records:
+        # Individual generation failures are skipped inside generate(), so an
+        # empty result means every call failed; do not write an empty golden
+        # set and report success.
+        _eprint("error: no golden cases were generated (every generation failed or returned no question)")
+        return 2
+    write_jsonl(records, args.out)
+    tiers = {}
+    for r in records:
+        tiers[r["difficulty"]] = tiers.get(r["difficulty"], 0) + 1
+    _eprint(f"Wrote {len(records)} golden cases -> {args.out}  ({dict(tiers)})")
+    return 0
+def cmd_evaluate(args) -> int:
+    """Judge predictions against a golden set, write results, optionally gate.
+    Returns 0 on success (or when the gate passes), 1 when ``--fail-under`` is
+    set and the mean of the judge dimensions is below it, and 2 when an input
+    file cannot be read or parsed, the semantic matcher cannot be built, or the
+    judge raises LLMError. The per-metric summary is printed to stderr.
+    """
+    from .goldenset import read_jsonl
+    from .judge import evaluate, write_results
+    from .llm import LLM, LLMError
+    try:
+        goldenset = read_jsonl(args.goldenset)
+        predictions = read_jsonl(args.predictions)
+    except (OSError, ValueError) as e:
+        # OSError: missing/unreadable file. ValueError: malformed JSON line
+        # (json.JSONDecodeError is a ValueError subclass).
+        _eprint(f"error: {e}")
+        return 2
+    matcher = None
+    if args.semantic:
+        from .embeddings import embedding_matcher
+        try:
+            matcher = embedding_matcher()
+        except RuntimeError as e:
+            # embedding_matcher raises RuntimeError when the optional openai
+            # package is not installed.
+            _eprint(f"error: {e}")
+            return 2
+    try:
+        results = evaluate(
+            goldenset, predictions, llm=LLM(model=args.model), k=args.k, matcher=matcher
+        )
+    except LLMError as e:
+        _eprint(f"error: {e}")
+        return 2
+    write_results(results, args.out)
+    agg = results["aggregate"]
+    _eprint(f"Judged {results['n']} cases with {results['judge_fingerprint']} -> {args.out}")
+    for name, value in agg.items():
+        _eprint(f"  {name:>18}: {value:.3f}")
+    if args.fail_under is not None:
+        # The gate looks only at the judge (generation) dimensions, not at the
+        # retrieval metrics.
+        overall = sum(agg[d] for d in JUDGE_DIMENSIONS) / len(JUDGE_DIMENSIONS)
+        if overall < args.fail_under:
+            _eprint(f"GATE FAIL: overall {overall:.3f} < {args.fail_under:.3f}")
+            return 1
+        _eprint(f"GATE PASS: overall {overall:.3f} >= {args.fail_under:.3f}")
+    return 0
+def cmd_report(args) -> int:
+    """Render the results file at ``args.results`` to an HTML scorecard at ``args.out``."""
+    from .judge import read_results
+    from .scorecard import write_html
+    destination = args.out
+    write_html(read_results(args.results), destination)
+    _eprint(f"Wrote scorecard -> {destination}")
+    return 0
+def cmd_diff(args) -> int:
+    """Compare a candidate results file against a baseline and gate on regressions.
+    The delta table is printed to stderr. Returns 2 when the two runs carry
+    different judge fingerprints and ``--allow-judge-mismatch`` was not given,
+    1 when any metric regressed, and 0 otherwise.
+    """
+    from .diffing import diff, format_table
+    from .judge import read_results
+    tol = args.tolerance
+    outcome = diff(read_results(args.baseline), read_results(args.candidate), tolerance=tol)
+    _eprint(format_table(outcome))
+    if outcome["judge_mismatch"]:
+        detail = (
+            f"judge mismatch: baseline={outcome['baseline_judge']} vs "
+            f"candidate={outcome['candidate_judge']} — scores are not comparable across judges"
+        )
+        if args.allow_judge_mismatch:
+            # Explicitly overridden: warn, then still run the regression check.
+            _eprint(f"warning: {detail}")
+        else:
+            _eprint(
+                f"error: {detail} (re-run both with the same judge, or pass --allow-judge-mismatch)"
+            )
+            return 2
+    dropped = outcome["regressed"]
+    if not dropped:
+        _eprint(f"OK: no metric regressed beyond {tol}")
+        return 0
+    _eprint(f"REGRESSION: {', '.join(dropped)} dropped more than {tol}")
+    return 1
+def cmd_demo(args) -> int:
+    """Write the canned demo results as an HTML scorecard to ``args.out``."""
+    from .demo import DEMO_RESULTS
+    from .scorecard import write_html
+    destination = args.out
+    write_html(DEMO_RESULTS, destination)
+    _eprint(f"Wrote demo scorecard -> {destination}  (open it in a browser)")
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    """Build the top-level parser with one sub-command per ``cmd_*`` handler.
+    Each sub-parser stores its handler in ``func`` so the caller can dispatch
+    on the parsed namespace. A sub-command is mandatory.
+    """
+    parser = argparse.ArgumentParser(
+        prog="proofrag", description="Zero-config RAG/LLM evaluation."
+    )
+    parser.add_argument("--version", action="version", version=f"proofrag {__version__}")
+    commands = parser.add_subparsers(dest="cmd", required=True)
+    # generate: corpus -> golden set
+    gen = commands.add_parser("generate", help="synthesize a golden set from a corpus")
+    gen.add_argument("--corpus", required=True, help="file or directory of docs/code")
+    gen.add_argument("--out", default="goldenset.jsonl")
+    gen.add_argument("--n", type=int, default=20, help="number of cases")
+    gen.add_argument("--seed", type=int, default=0)
+    gen.add_argument("--chunk-chars", type=int, default=1200)
+    gen.add_argument("--model", default=None, help="override judge/generator model")
+    gen.set_defaults(func=cmd_generate)
+    # evaluate: golden set + predictions -> results
+    ev = commands.add_parser("evaluate", help="judge predictions against a golden set")
+    ev.add_argument("--goldenset", required=True)
+    ev.add_argument(
+        "--predictions", required=True, help="jsonl of {id, answer, retrieved_contexts}"
+    )
+    ev.add_argument("--out", default="results.json")
+    ev.add_argument("--model", default=None)
+    ev.add_argument(
+        "--k",
+        type=int,
+        default=5,
+        help="cutoff for retrieval metrics (Recall@k, NDCG@k, ...)",
+    )
+    ev.add_argument(
+        "--semantic",
+        action="store_true",
+        help="use embedding cosine for chunk relevance instead of token overlap (needs [openai])",
+    )
+    ev.add_argument(
+        "--fail-under",
+        type=float,
+        default=None,
+        help="CI gate: exit 1 if overall generation score < this (0-1)",
+    )
+    ev.set_defaults(func=cmd_evaluate)
+    # report: results -> HTML
+    rep = commands.add_parser("report", help="render results.json to an HTML scorecard")
+    rep.add_argument("--results", required=True)
+    rep.add_argument("--out", default="scorecard.html")
+    rep.set_defaults(func=cmd_report)
+    # diff: candidate results vs baseline results
+    cmp_ = commands.add_parser(
+        "diff", help="compare results against a baseline; fail on regression"
+    )
+    cmp_.add_argument(
+        "--baseline", required=True, help="baseline results.json (a known-good run)"
+    )
+    cmp_.add_argument("--candidate", required=True, help="new results.json to compare")
+    cmp_.add_argument(
+        "--tolerance",
+        type=float,
+        default=0.02,
+        help="allowed drop before flagging a regression",
+    )
+    cmp_.add_argument(
+        "--allow-judge-mismatch",
+        action="store_true",
+        help="compare even if judge models differ",
+    )
+    cmp_.set_defaults(func=cmd_diff)
+    # demo: canned scorecard
+    demo = commands.add_parser("demo", help="render a sample scorecard (no API key needed)")
+    demo.add_argument("--out", default="scorecard.html")
+    demo.set_defaults(func=cmd_demo)
+    return parser
+def main(argv=None) -> int:
+    """Parse ``argv`` and run the selected sub-command, returning its exit code."""
+    namespace = build_parser().parse_args(argv)
+    handler = namespace.func  # set by each sub-parser via set_defaults(func=...)
+    return handler(namespace)
+if __name__ == "__main__":
+    # Propagate the handler's return value as the process exit status.
+    raise SystemExit(main())

proofrag/corpus.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Load and chunk a corpus from a file or directory tree."""
+from __future__ import annotations
+from pathlib import Path
+# File suffixes (lower-case, leading dot) that load_corpus accepts when it walks a
+# directory tree. A path that points directly at a single file is not filtered by
+# this set.
+TEXT_EXT = {
+    ".md",
+    ".markdown",
+    ".txt",
+    ".rst",
+    ".mdx",
+    ".py",
+    ".js",
+    ".ts",
+    ".tsx",
+    ".java",
+    ".go",
+    ".rb",
+    ".rs",
+}
+def load_corpus(path: str, max_chars: int = 1200) -> list[dict]:
+    """Return a flat list of chunks: {source, chunk_id, text}.
+    `path` may be a single file (read as-is, whatever its suffix) or a directory,
+    which is walked recursively for files whose suffix is in TEXT_EXT, in sorted
+    order. Files are decoded as UTF-8 with undecodable bytes dropped; files that
+    cannot be read are skipped. `chunk_id` is "<label>::<index>", where the label
+    is the file's POSIX path relative to the corpus directory (just the file name
+    for a single-file corpus or a top-level file), so same-named files in
+    different sub-directories get distinct ids.
+    Raises FileNotFoundError if `path` does not exist and ValueError if no chunk
+    could be produced.
+    """
+    p = Path(path)
+    if not p.exists():
+        raise FileNotFoundError(f"Corpus path not found: {path}")
+    if p.is_file():
+        files = [p]
+    else:
+        files = sorted(f for f in p.rglob("*") if f.is_file() and f.suffix.lower() in TEXT_EXT)
+    chunks: list[dict] = []
+    for f in files:
+        try:
+            text = f.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            # Best effort: one unreadable file should not abort the whole load.
+            continue
+        # Using only f.name here would make docs/a/README.md and docs/b/README.md
+        # share the ids "README.md::0", "README.md::1", ...
+        label = f.name if f == p else f.relative_to(p).as_posix()
+        for i, body in enumerate(_split(text, max_chars)):
+            chunks.append({"source": str(f), "chunk_id": f"{label}::{i}", "text": body})
+    if not chunks:
+        raise ValueError(f"No readable text chunks found under {path}")
+    return chunks
+def _split(text: str, max_chars: int) -> list[str]:
+    """Pack blank-line-separated paragraphs greedily into chunks.
+    A chunk is closed as soon as adding the next paragraph would push the running
+    size (paragraph lengths plus two separator characters each) past `max_chars`.
+    A single paragraph longer than `max_chars` is kept whole.
+    """
+    chunks: list[str] = []
+    pending: list[str] = []
+    pending_size = 0  # len of the pending paragraphs, each followed by "\n\n"
+    for block in text.split("\n\n"):
+        paragraph = block.strip()
+        if not paragraph:
+            continue
+        if pending and pending_size + len(paragraph) > max_chars:
+            chunks.append("\n\n".join(pending))
+            pending = []
+            pending_size = 0
+        pending.append(paragraph)
+        pending_size += len(paragraph) + 2
+    if pending:
+        chunks.append("\n\n".join(pending))
+    return chunks

proofrag/demo.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""Canned results so `proofrag demo` renders a real scorecard with no API key.
+Used for the README screenshot, for trying the tool in 5 seconds, and for CI
+smoke tests that must run without credentials.
+"""
+from __future__ import annotations
+def _ret(recall, precision, ndcg, mrr):
+    """Bundle four retrieval scores into the per-record ``retrieval`` dict."""
+    names = ("recall_at_k", "precision_at_k", "ndcg_at_k", "mrr")
+    return dict(zip(names, (recall, precision, ndcg, mrr)))
+# Canned results payload rendered by the `demo` command: run-level fields, an
+# "aggregate" score per metric, and eight per-question records. The unanswerable
+# record (q005) carries "retrieval": None instead of a metrics dict.
+# NOTE(review): the aggregate numbers are hand-written and are not the means of
+# the records below (e.g. mean groundedness of the eight records is ~0.72, not
+# 0.86) — confirm whether they are meant to agree.
+DEMO_RESULTS = {
+    "judge_fingerprint": "anthropic:claude-haiku-4-5-20251001",
+    "created": "2026-05-31T00:00:00+00:00",
+    "k": 5,
+    "n": 8,
+    "aggregate": {
+        "groundedness": 0.86,
+        "correctness": 0.79,
+        "completeness": 0.71,
+        "citation_quality": 0.68,
+        "recall_at_k": 0.77,
+        "precision_at_k": 0.55,
+        "ndcg_at_k": 0.73,
+        "mrr": 0.81,
+    },
+    "records": [
+        {
+            "id": "q000",
+            "question": "How do I rotate an API key without downtime?",
+            "difficulty": "single_doc",
+            "answer": "Create a new key, deploy it, then revoke the old one.",
+            "scores": {
+                "groundedness": 0.95,
+                "correctness": 0.92,
+                "completeness": 0.88,
+                "citation_quality": 0.85,
+            },
+            "retrieval": _ret(1.0, 0.6, 1.0, 1.0),
+            "rationale": "Fully grounded and matches the reference.",
+        },
+        {
+            "id": "q001",
+            "question": "What regions support the EU data residency tier?",
+            "difficulty": "single_doc",
+            "answer": "Frankfurt and Dublin.",
+            "scores": {
+                "groundedness": 0.9,
+                "correctness": 0.85,
+                "completeness": 0.6,
+                "citation_quality": 0.7,
+            },
+            "retrieval": _ret(1.0, 0.4, 0.92, 1.0),
+            "rationale": "Correct but omits the Paris region the reference lists.",
+        },
+        {
+            "id": "q002",
+            "question": "Does the free plan include webhook retries and a dead-letter queue?",
+            "difficulty": "multi_doc",
+            "answer": "Yes, the free plan includes both.",
+            "scores": {
+                "groundedness": 0.3,
+                "correctness": 0.2,
+                "completeness": 0.4,
+                "citation_quality": 0.25,
+            },
+            "retrieval": _ret(0.5, 0.2, 0.39, 0.33),
+            "rationale": "Hallucinated: only retries are free; DLQ is paid. Retriever ranked the pricing doc low.",
+        },
+        {
+            "id": "q003",
+            "question": "What is the maximum payload size for the batch endpoint?",
+            "difficulty": "single_doc",
+            "answer": "10 MB per request.",
+            "scores": {
+                "groundedness": 0.88,
+                "correctness": 0.9,
+                "completeness": 0.8,
+                "citation_quality": 0.75,
+            },
+            "retrieval": _ret(1.0, 0.8, 1.0, 1.0),
+            "rationale": "Accurate and grounded.",
+        },
+        {
+            "id": "q004",
+            "question": "How does SSO group mapping interact with custom roles?",
+            "difficulty": "multi_doc",
+            "answer": "Groups map to roles automatically; custom roles override defaults.",
+            "scores": {
+                "groundedness": 0.6,
+                "correctness": 0.55,
+                "completeness": 0.5,
+                "citation_quality": 0.45,
+            },
+            "retrieval": _ret(0.5, 0.4, 0.63, 0.5),
+            "rationale": "Partially right; the precedence rule is stated backwards.",
+        },
+        {
+            "id": "q005",
+            "question": "What is the CEO's home address?",
+            "difficulty": "unanswerable",
+            "answer": "I don't have that information in the provided context.",
+            "scores": {
+                "groundedness": 1.0,
+                "correctness": 1.0,
+                "completeness": 1.0,
+                "citation_quality": 0.9,
+            },
+            "retrieval": None,
+            "rationale": "Correctly refused an unanswerable question.",
+        },
+        {
+            "id": "q006",
+            "question": "How long are audit logs retained on the enterprise plan?",
+            "difficulty": "single_doc",
+            "answer": "Forever.",
+            "scores": {
+                "groundedness": 0.2,
+                "correctness": 0.15,
+                "completeness": 0.3,
+                "citation_quality": 0.2,
+            },
+            "retrieval": _ret(0.0, 0.0, 0.0, 0.0),
+            "rationale": "Wrong (retention is 2 years) and no relevant context was retrieved.",
+        },
+        {
+            "id": "q007",
+            "question": "Which auth methods does the CLI support?",
+            "difficulty": "single_doc",
+            "answer": "API key and OAuth device flow.",
+            "scores": {
+                "groundedness": 0.92,
+                "correctness": 0.88,
+                "completeness": 0.85,
+                "citation_quality": 0.8,
+            },
+            "retrieval": _ret(1.0, 0.8, 1.0, 1.0),
+            "rationale": "Grounded and complete.",
+        },
+    ],
+}

proofrag/diffing.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Baseline diffing: compare two results.json runs and flag regressions.
+A baseline is just a results.json from a known-good run (commit it to the repo).
+On every change you re-evaluate and `diff` against it: any metric that drops by
+more than the tolerance is a regression and fails the build. Because all metrics
+here are higher-is-better, "regression" simply means delta < -tolerance.
+Judge models are pinned for a reason — comparing scores produced by different
+judges is meaningless, so a fingerprint mismatch is refused unless explicitly
+overridden.
+"""
+from __future__ import annotations
+from .judge import JUDGE_DIMENSIONS
+from .metrics import RETRIEVAL_METRICS
+# Every metric diff() compares, in table order: judge dimensions first, then
+# retrieval metrics.
+ALL_METRICS = JUDGE_DIMENSIONS + RETRIEVAL_METRICS
+def diff(baseline: dict, candidate: dict, tolerance: float = 0.02) -> dict:
+    """Compare candidate vs baseline aggregates. All metrics are higher-is-better.
+    Builds one row per metric in ALL_METRICS that appears in at least one of the
+    two "aggregate" dicts. The delta is candidate minus baseline rounded to three
+    decimals (None when either side is missing); a row is a regression when its
+    delta is below -tolerance. The result also reports whether the two runs
+    carry different judge fingerprints.
+    """
+    base_scores = baseline.get("aggregate", {})
+    cand_scores = candidate.get("aggregate", {})
+    rows = []
+    for metric in ALL_METRICS:
+        if not (metric in base_scores or metric in cand_scores):
+            continue
+        before = base_scores.get(metric)
+        after = cand_scores.get(metric)
+        if before is None or after is None:
+            # Not comparable: never counted as a regression.
+            change = None
+            dropped = False
+        else:
+            change = round(after - before, 3)
+            dropped = change < -tolerance
+        rows.append(
+            {
+                "metric": metric,
+                "baseline": before,
+                "candidate": after,
+                "delta": change,
+                "regressed": dropped,
+            }
+        )
+    base_judge = baseline.get("judge_fingerprint")
+    cand_judge = candidate.get("judge_fingerprint")
+    return {
+        "rows": rows,
+        "regressed": [row["metric"] for row in rows if row["regressed"]],
+        "tolerance": tolerance,
+        "judge_mismatch": base_judge != cand_judge,
+        "baseline_judge": base_judge,
+        "candidate_judge": cand_judge,
+    }
+def format_table(result: dict) -> str:
+    """Render the rows of a diff result as a right-aligned plain-text table.
+    Missing values are shown as an em dash; regressed rows get a trailing marker.
+    """
+    def cell(value, spec: str) -> str:
+        return "—" if value is None else format(value, spec)
+    lines = [f"{'metric':>16}  {'base':>7}  {'cand':>7}  {'delta':>7}"]
+    for row in result["rows"]:
+        base = cell(row["baseline"], ".3f")
+        cand = cell(row["candidate"], ".3f")
+        change = cell(row["delta"], "+.3f")
+        marker = "   << REGRESSION" if row["regressed"] else ""
+        lines.append(f"{row['metric']:>16}  {base:>7}  {cand:>7}  {change:>7}{marker}")
+    return "\n".join(lines)

proofrag/embeddings.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Optional semantic matcher for retrieval metrics.
+Lexical (token-overlap) matching is the zero-dependency default. When chunks are
+paraphrased rather than copied, swap in an embedding matcher: it marks a retrieved
+chunk relevant to a gold context when their cosine similarity clears a threshold.
+Uses the OpenAI-compatible embeddings API (also covers local servers via
+OPENAI_BASE_URL). Requires the `openai` extra and OPENAI_API_KEY.
+"""
+from __future__ import annotations
+import math
+import os
+from .metrics import Matcher
+DEFAULT_EMBED_MODEL = "text-embedding-3-small"
+def _cosine(a: list[float], b: list[float]) -> float:
+    """Cosine similarity of two equal-length vectors; 0.0 if either has zero norm.
+    Raises ValueError when the lengths differ (strict zip).
+    """
+    dot = sum(x * y for x, y in zip(a, b, strict=True))
+    norm_a = math.sqrt(sum(x * x for x in a))
+    norm_b = math.sqrt(sum(y * y for y in b))
+    if not (norm_a and norm_b):
+        return 0.0
+    return dot / (norm_a * norm_b)
+def embedding_matcher(threshold: float = 0.75, model: str | None = None) -> Matcher:
+    """Return a matcher backed by embedding cosine similarity.
+    The returned callable takes (gold, chunk) and is true when the cosine of
+    their embeddings is at least `threshold`. The model comes from `model`, then
+    PROOFRAG_EMBED_MODEL, then DEFAULT_EMBED_MODEL. Texts are cut to their first
+    2000 characters before embedding, and each distinct cut text is embedded
+    once per matcher. Raises RuntimeError if the openai package is missing.
+    """
+    try:
+        import openai
+    except ImportError as e:
+        raise RuntimeError("embedding_matcher needs: pip install 'proofrag[openai]'") from e
+    base_url = os.environ.get("OPENAI_BASE_URL")
+    # Only pass base_url when it is set, so the client keeps its own default otherwise.
+    client_kwargs = {"base_url": base_url} if base_url else {}
+    client = openai.OpenAI(**client_kwargs)
+    embed_model = model or os.environ.get("PROOFRAG_EMBED_MODEL") or DEFAULT_EMBED_MODEL
+    vectors: dict[str, list[float]] = {}
+    def vector_for(text: str) -> list[float]:
+        clipped = text[:2000]
+        try:
+            return vectors[clipped]
+        except KeyError:
+            response = client.embeddings.create(model=embed_model, input=clipped)
+            vectors[clipped] = response.data[0].embedding
+            return vectors[clipped]
+    def match(gold: str, chunk: str) -> bool:
+        similarity = _cosine(vector_for(gold), vector_for(chunk))
+        return similarity >= threshold
+    return match

proofrag/goldenset.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""Synthesize a golden evaluation set from a corpus.
+The wedge: most teams never build evals because hand-writing a balanced,
+non-contaminated test set is tedious. This generates one from your own docs,
+with difficulty tiers that catch the failure modes papers flag as worst:
+multi-document questions and unanswerable (refusal) cases.
+Output is versioned JSONL. One record per line:
+  {id, question, gold_answer, gold_contexts[], difficulty, sources[]}
+"""
+from __future__ import annotations
+import json
+import random
+from .llm import LLM
+# System prompt shared by every generation call.
+SYS = "You write evaluation questions for a RAG system. Output strict JSON only, no prose."
+# Single-passage template; placeholders: {source}, {text}. Doubled braces are
+# literal braces once str.format() has run.
+_SINGLE = '''From this passage write ONE specific question fully answerable from it, and the ideal grounded answer.
+Source: {source}
+Passage:
+"""{text}"""
+Return JSON: {{"question": "...", "gold_answer": "..."}}'''
+# Two-passage template; placeholders: {src_a}, {text_a}, {src_b}, {text_b}.
+_MULTI = '''From these TWO passages write ONE question that requires BOTH to answer, and the ideal answer that synthesizes them.
+Passage A ({src_a}):
+"""{text_a}"""
+Passage B ({src_b}):
+"""{text_b}"""
+Return JSON: {{"question": "..."}}'''
+# Unanswerable-question template; placeholder: {text}. Only a question is requested.
+_UNANS = '''Here is a passage from a knowledge base. Write ONE realistic question that is on-topic but CANNOT be answered from this passage (the info is simply not present).
+Passage:
+"""{text}"""
+Return JSON: {{"question": "..."}}'''
+# Gold answer stored for every unanswerable case.
+_REFUSAL = "I don't have enough information in the provided context to answer that."
+def generate(chunks: list[dict], n: int = 20, seed: int = 0, llm: LLM | None = None) -> list[dict]:
+    """Generate up to `n` golden records with ~70% single / 20% multi / 10% unanswerable.
+    `chunks` are dicts with "source" and "text"; they are shuffled with `seed`.
+    Single-doc cases use the first chunks of the shuffled pool. Multi-doc pairs
+    and unanswerable cases continue from there and wrap around to the start of
+    the pool when it runs out, so a small corpus still yields the harder tiers
+    (at most one pair per two chunks and one unanswerable case per chunk). When
+    the pool is large enough the chunk assignment is the same as without
+    wrapping. Generations that fail, or whose payload has no non-blank string
+    "question", are skipped, so fewer than `n` records may come back. Ids are
+    assigned last as q000, q001, ... in output order.
+    """
+    llm = llm or LLM()
+    rng = random.Random(seed)
+    pool = chunks[:]
+    rng.shuffle(pool)
+    size = len(pool)
+    n_single = max(1, round(n * 0.7))
+    n_multi = round(n * 0.2) if size >= 2 else 0
+    n_unans = max(0, n - n_single - n_multi)
+    records: list[dict] = []
+    def _field(payload, key: str) -> str:
+        # Model output is untrusted JSON: a null or non-string value must not
+        # reach _record(), whose .strip() calls would abort the whole run.
+        value = payload.get(key) if isinstance(payload, dict) else None
+        return value.strip() if isinstance(value, str) else ""
+    for c in pool[:n_single]:
+        out = _try(llm, _SINGLE.format(source=c["source"], text=c["text"][:1500]))
+        question = _field(out, "question")
+        if question:
+            records.append(
+                _record(
+                    question,
+                    _field(out, "gold_answer"),
+                    [c["text"]],
+                    "single_doc",
+                    [c["source"]],
+                )
+            )
+    cursor = n_single
+    # size >= 2 whenever n_multi > 0, so the two wrapped indices always differ.
+    for _ in range(min(n_multi, size // 2)):
+        a, b = pool[cursor % size], pool[(cursor + 1) % size]
+        cursor += 2
+        out = _try(
+            llm,
+            _MULTI.format(
+                src_a=a["source"], text_a=a["text"][:900], src_b=b["source"], text_b=b["text"][:900]
+            ),
+        )
+        question = _field(out, "question")
+        if question:
+            records.append(
+                _record(
+                    question,
+                    _field(out, "gold_answer"),
+                    [a["text"], b["text"]],
+                    "multi_doc",
+                    [a["source"], b["source"]],
+                )
+            )
+    # min(..., size) is 0 for an empty pool, so the modulo below never divides by zero.
+    for offset in range(min(n_unans, size)):
+        c = pool[(cursor + offset) % size]
+        out = _try(llm, _UNANS.format(text=c["text"][:1500]))
+        question = _field(out, "question")
+        if question:
+            records.append(_record(question, _REFUSAL, [], "unanswerable", []))
+    for i, r in enumerate(records):
+        r["id"] = f"q{i:03d}"
+    return records
+def _try(llm: LLM, prompt: str) -> dict | None:
+    """Request one JSON payload for `prompt`; return None if the call raises."""
+    try:
+        payload = llm.complete_json(SYS, prompt)
+    except Exception:  # noqa: BLE001 - one bad generation shouldn't abort the run
+        return None
+    return payload
+def _record(question, gold_answer, gold_contexts, difficulty, sources) -> dict:
+    """Assemble one golden-set record; the id is left blank for the caller to fill."""
+    return dict(
+        id="",
+        question=question.strip(),
+        gold_answer=gold_answer.strip(),
+        gold_contexts=gold_contexts,
+        difficulty=difficulty,
+        sources=sources,
+    )
+def write_jsonl(records: list[dict], path: str) -> None:
+    """Write `records` to `path` as UTF-8 JSON Lines, one object per line, non-ASCII kept as-is."""
+    encoded = (json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
+    with open(path, "w", encoding="utf-8") as fh:
+        fh.writelines(encoded)
+def read_jsonl(path: str) -> list[dict]:
+    """Read a JSON Lines file and return one parsed value per non-blank line.
+    The file is decoded as UTF-8; a leading byte-order mark (as written by some
+    editors and tools) is dropped instead of failing the first line. Raises
+    json.JSONDecodeError, with the file path and 1-based line number prepended
+    to the message, when a line is not valid JSON.
+    """
+    records: list[dict] = []
+    # "utf-8-sig" reads plain UTF-8 unchanged and strips a BOM if present;
+    # json.loads rejects text that still starts with one.
+    with open(path, encoding="utf-8-sig") as f:
+        for lineno, line in enumerate(f, start=1):
+            if not line.strip():
+                continue
+            try:
+                records.append(json.loads(line))
+            except json.JSONDecodeError as e:
+                # Same exception type as before, now pointing at the bad line.
+                raise json.JSONDecodeError(f"{path}:{lineno}: {e.msg}", e.doc, e.pos) from e
+    return records