proofrag 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. proofrag-0.3.0/.claude-plugin/marketplace.json +20 -0
  2. proofrag-0.3.0/.claude-plugin/plugin.json +10 -0
  3. proofrag-0.3.0/.env.example +27 -0
  4. proofrag-0.3.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
  5. proofrag-0.3.0/.github/ISSUE_TEMPLATE/config.yml +1 -0
  6. proofrag-0.3.0/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
  7. proofrag-0.3.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
  8. proofrag-0.3.0/.github/workflows/ci.yml +29 -0
  9. proofrag-0.3.0/.github/workflows/publish.yml +26 -0
  10. proofrag-0.3.0/.gitignore +19 -0
  11. proofrag-0.3.0/.python-version +1 -0
  12. proofrag-0.3.0/AGENTS.md +36 -0
  13. proofrag-0.3.0/CHANGELOG.md +45 -0
  14. proofrag-0.3.0/CONTRIBUTING.md +61 -0
  15. proofrag-0.3.0/LICENSE +21 -0
  16. proofrag-0.3.0/Makefile +22 -0
  17. proofrag-0.3.0/PKG-INFO +183 -0
  18. proofrag-0.3.0/README.md +156 -0
  19. proofrag-0.3.0/action.yml +84 -0
  20. proofrag-0.3.0/commands/proofrag.md +21 -0
  21. proofrag-0.3.0/devtools/lint.py +27 -0
  22. proofrag-0.3.0/docs/demo.gif +0 -0
  23. proofrag-0.3.0/docs/demo.tape +39 -0
  24. proofrag-0.3.0/docs/scorecard.png +0 -0
  25. proofrag-0.3.0/examples/ci/proofrag-eval.yml +45 -0
  26. proofrag-0.3.0/examples/docs-rag/corpus/api.md +19 -0
  27. proofrag-0.3.0/examples/docs-rag/corpus/platform.md +18 -0
  28. proofrag-0.3.0/examples/docs-rag/naive_rag.py +73 -0
  29. proofrag-0.3.0/pyproject.toml +102 -0
  30. proofrag-0.3.0/skills/proofrag/SKILL.md +90 -0
  31. proofrag-0.3.0/src/proofrag/__init__.py +8 -0
  32. proofrag-0.3.0/src/proofrag/cli.py +187 -0
  33. proofrag-0.3.0/src/proofrag/corpus.py +59 -0
  34. proofrag-0.3.0/src/proofrag/demo.py +143 -0
  35. proofrag-0.3.0/src/proofrag/diffing.py +57 -0
  36. proofrag-0.3.0/src/proofrag/embeddings.py +53 -0
  37. proofrag-0.3.0/src/proofrag/goldenset.py +128 -0
  38. proofrag-0.3.0/src/proofrag/judge.py +142 -0
  39. proofrag-0.3.0/src/proofrag/llm.py +117 -0
  40. proofrag-0.3.0/src/proofrag/metrics.py +106 -0
  41. proofrag-0.3.0/src/proofrag/scorecard.py +218 -0
  42. proofrag-0.3.0/tests/test_smoke.py +106 -0
  43. proofrag-0.3.0/uv.lock +567 -0
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "proofrag",
3
+ "owner": { "name": "Ansh Dawda", "url": "https://github.com/unshDee" },
4
+ "metadata": {
5
+ "description": "RAG/LLM evaluation skill — golden sets, LLM-as-judge, scorecards.",
6
+ "version": "0.1.0"
7
+ },
8
+ "plugins": [
9
+ {
10
+ "name": "proofrag",
11
+ "source": "./",
12
+ "description": "Evaluate a RAG/LLM app: golden set from your docs + LLM-as-judge + retrieval metrics + shareable scorecard + CI gate.",
13
+ "version": "0.1.0",
14
+ "author": { "name": "Ansh Dawda" },
15
+ "homepage": "https://github.com/unshDee/proofrag",
16
+ "license": "MIT",
17
+ "keywords": ["rag", "llm", "evaluation", "llm-as-judge", "skill"]
18
+ }
19
+ ]
20
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "name": "proofrag",
3
+ "version": "0.1.0",
4
+ "description": "Evaluate a RAG/LLM app: generate a golden set from your docs, run LLM-as-judge + retrieval metrics, and produce a shareable HTML scorecard with a CI gate.",
5
+ "author": { "name": "Ansh Dawda", "email": "ansh.dawda@gmail.com" },
6
+ "homepage": "https://github.com/unshDee/proofrag",
7
+ "repository": "https://github.com/unshDee/proofrag",
8
+ "license": "MIT",
9
+ "keywords": ["rag", "llm", "evaluation", "llm-as-judge", "retrieval", "skill"]
10
+ }
@@ -0,0 +1,27 @@
1
+ # Copy this file to `.env` and fill in your key: cp .env.example .env
2
+ # `.env` is gitignored — never commit real keys.
3
+ #
4
+ # proofrag does not auto-load .env. Load it before running, e.g.:
5
+ # set -a && source .env && set +a
6
+ # (or just `export` the vars yourself).
7
+
8
+ # --- Backend (pick ONE) ------------------------------------------------------
9
+
10
+ # Anthropic — the default backend (cheap Haiku judge).
11
+ ANTHROPIC_API_KEY=
12
+
13
+ # OpenAI-compatible — also covers local servers (Ollama, vLLM, LM Studio)
14
+ # via OPENAI_BASE_URL. Needed for `evaluate --semantic` (embeddings).
15
+ # OPENAI_API_KEY=
16
+ # OPENAI_BASE_URL=http://localhost:11434/v1
17
+
18
+ # --- Optional overrides ------------------------------------------------------
19
+
20
+ # Force a provider instead of auto-detecting from the keys above.
21
+ # PROOFRAG_PROVIDER=anthropic # or: openai
22
+
23
+ # Judge & generator model (defaults: Haiku for Anthropic, gpt-4o-mini for OpenAI).
24
+ # PROOFRAG_MODEL=
25
+
26
+ # Embedding model used by `evaluate --semantic`.
27
+ # PROOFRAG_EMBED_MODEL=text-embedding-3-small
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: Bug report
3
+ about: Something isn't working as expected
4
+ title: "bug: "
5
+ labels: bug
6
+ ---
7
+
8
+ **What happened**
9
+ <!-- A clear description of the bug. -->
10
+
11
+ **Steps to reproduce**
12
+ 1.
13
+ 2.
14
+
15
+ **Expected**
16
+ <!-- What you expected instead. -->
17
+
18
+ **Environment**
19
+ - proofrag version: <!-- `proofrag --version` -->
20
+ - Python:
21
+ - Backend: <!-- anthropic / openai / local -->
22
+ - OS:
23
+
24
+ **Logs / scorecard**
25
+ <!-- Paste error output, or attach the scorecard HTML/JSON if relevant. -->
@@ -0,0 +1 @@
1
+ blank_issues_enabled: true
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest a capability or improvement
4
+ title: "feat: "
5
+ labels: enhancement
6
+ ---
7
+
8
+ **Problem**
9
+ <!-- What are you trying to do that proofrag doesn't support today? -->
10
+
11
+ **Proposed solution**
12
+ <!-- What would the command / metric / output look like? -->
13
+
14
+ **Alternatives considered**
15
+ <!-- Other tools or approaches, and why they fall short. -->
@@ -0,0 +1,17 @@
1
+ ## Summary
2
+
3
+ <!-- What does this PR do, and why? -->
4
+
5
+ ## Type
6
+
7
+ - [ ] feat — new capability
8
+ - [ ] fix — bug fix
9
+ - [ ] docs — documentation only
10
+ - [ ] chore / refactor / test
11
+
12
+ ## Checklist
13
+
14
+ - [ ] `make lint` passes
15
+ - [ ] `make test` passes
16
+ - [ ] Updated `CHANGELOG.md` under `## [Unreleased]` (if user-facing)
17
+ - [ ] Updated docs / `SKILL.md` if behavior or commands changed
@@ -0,0 +1,29 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.11", "3.12", "3.13"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ with:
17
+ fetch-depth: 0 # uv-dynamic-versioning needs tags/history
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v6
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+ - name: Install
23
+ run: uv sync --all-extras
24
+ - name: Lint
25
+ run: uv run python devtools/lint.py
26
+ - name: Test
27
+ run: uv run pytest
28
+ - name: Demo scorecard renders without an API key
29
+ run: uv run proofrag demo --out /tmp/scorecard.html
@@ -0,0 +1,26 @@
1
+ name: Publish
2
+
3
+ # Publishes proofrag to PyPI when a GitHub Release is published.
4
+ # Uses PyPI Trusted Publishing (OIDC) — configure the publisher once at
5
+ # https://pypi.org/manage/project/proofrag/settings/publishing/ (no token needed).
6
+
7
+ on:
8
+ release:
9
+ types: [published]
10
+
11
+ jobs:
12
+ pypi:
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ permissions:
16
+ id-token: write # required for trusted publishing
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ with:
20
+ fetch-depth: 0 # uv-dynamic-versioning derives the version from tags
21
+ - name: Install uv
22
+ uses: astral-sh/setup-uv@v6
23
+ - name: Build
24
+ run: uv build
25
+ - name: Publish to PyPI
26
+ run: uv publish
@@ -0,0 +1,19 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ build/
5
+ dist/
6
+ .venv/
7
+ venv/
8
+ .pytest_cache/
9
+ .DS_Store
10
+ # eval artifacts (commit goldenset.jsonl deliberately, ignore the rest)
11
+ results.json
12
+ predictions.jsonl
13
+ scorecard.html
14
+ !docs/scorecard.png
15
+ # secrets — never commit
16
+ .env
17
+ .env.*
18
+ !.env.example
19
+ .venv/
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,36 @@
1
+ # Agents
2
+
3
+ This repo ships **proofrag** as a portable [Agent Skill](https://agentskills.io):
4
+ `skills/proofrag/SKILL.md`. The skill is the interface; the `proofrag` Python CLI
5
+ (`src/proofrag/`) is the engine it drives.
6
+
7
+ ## Use it as a skill
8
+
9
+ **Claude Code (plugin):**
10
+ ```
11
+ /plugin marketplace add unshDee/proofrag
12
+ /plugin install proofrag@proofrag
13
+ ```
14
+ Then just ask: *"evaluate my RAG"* — Claude auto-loads the skill. Or type `/proofrag`.
15
+
16
+ **Claude Code (manual):** copy the skill folder where Claude discovers skills:
17
+ ```
18
+ cp -r skills/proofrag ~/.claude/skills/ # personal
19
+ cp -r skills/proofrag .claude/skills/ # this project only
20
+ ```
21
+
22
+ **Codex / other agents (open standard):** drop the skill into your agent's skills
23
+ directory (e.g. `.agents/skills/` or your tool's equivalent):
24
+ ```
25
+ cp -r skills/proofrag .agents/skills/
26
+ ```
27
+
28
+ ## Install the engine
29
+
30
+ The skill calls the `proofrag` CLI. Install it once, or run ad-hoc with `uvx`:
31
+ ```
32
+ uv tool install "proofrag[anthropic]" # or: pipx install "proofrag[anthropic]"
33
+ uvx "proofrag[anthropic]" demo # no install
34
+ ```
35
+ Set `ANTHROPIC_API_KEY` (default Haiku) or `OPENAI_API_KEY` (`OPENAI_BASE_URL` for
36
+ local/Ollama). No key needed for `proofrag demo`.
@@ -0,0 +1,45 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres
5
+ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [Unreleased]
8
+
9
+ ### Added
10
+ - `proofrag diff` — compare a run against a committed baseline results.json and
11
+ fail on regression (per-metric delta table, `--tolerance`, refuses to compare
12
+ across different judge models unless `--allow-judge-mismatch`).
13
+ - Reusable composite GitHub Action (`action.yml`): `uses: unshDee/proofrag@v0`
14
+ installs the CLI, evaluates, writes the scorecard, and gates on the absolute
15
+ floor and/or the baseline. Example workflow in `examples/ci/`.
16
+
17
+ ## [0.2.0] - 2026-05-31
18
+
19
+ ### Added
20
+ - Rank-aware retrieval metrics: Recall@k, Precision@k, NDCG@k, MRR, with a
21
+ pluggable relevance matcher (`metrics.py`).
22
+ - Optional embedding-based semantic matcher (`embeddings.py`); `evaluate --semantic`
23
+ and `--k` to set the cutoff.
24
+ - Scorecard split into Generation and Retrieval panels with an NDCG@k headline.
25
+ - Animated demo GIF of the full eval loop (`docs/demo.gif`, reproducible via
26
+ `docs/demo.tape`).
27
+ - Installable as a Claude Code plugin: `.claude-plugin/` manifests, `/proofrag`
28
+ slash command, `AGENTS.md`, and skill-discovery layout under `skills/proofrag/`.
29
+
30
+ ### Changed
31
+ - Unanswerable cases skip retrieval scoring so they don't skew the averages.
32
+
33
+ ## [0.1.0] - 2026-05-31
34
+
35
+ ### Added
36
+ - Golden-set generator from a corpus, with single-doc / multi-doc / unanswerable
37
+ difficulty tiers.
38
+ - LLM-as-judge scoring (groundedness, correctness, completeness, citation quality),
39
+ pinned and fingerprinted.
40
+ - Self-contained, shareable HTML scorecard, plus a keyless `demo` command.
41
+ - `--fail-under` CI gate; provider-agnostic backend (Anthropic / OpenAI / local).
42
+
43
+ [Unreleased]: https://github.com/unshDee/proofrag/compare/v0.2.0...HEAD
44
+ [0.2.0]: https://github.com/unshDee/proofrag/compare/v0.1.0...v0.2.0
45
+ [0.1.0]: https://github.com/unshDee/proofrag/releases/tag/v0.1.0
@@ -0,0 +1,61 @@
1
+ # Contributing to proofrag
2
+
3
+ Thanks for considering a contribution! proofrag is an Agent Skill + Python CLI for
4
+ evaluating RAG/LLM apps. This guide covers the dev setup and the workflow.
5
+
6
+ ## Dev setup
7
+
8
+ Uses [uv](https://docs.astral.sh/uv/).
9
+
10
+ ```bash
11
+ git clone https://github.com/unshDee/proofrag && cd proofrag
12
+ uv sync --all-extras # installs the package + both backends + dev tools
13
+ ```
14
+
15
+ Run the checks (CI runs exactly these):
16
+
17
+ ```bash
18
+ make test # or: uv run pytest
19
+ make lint # or: uv run python devtools/lint.py (ruff + codespell + basedpyright)
20
+ ```
21
+
22
+ No API key needed for tests — they're fully offline. For a live end-to-end run, copy
23
+ the env template and add a key, then load it:
24
+
25
+ ```bash
26
+ cp .env.example .env # then put your key in .env
27
+ set -a && source .env && set +a
28
+ ```
29
+
30
+ `.env` is gitignored; never commit real keys.
31
+
32
+ ## Workflow (GitHub Flow)
33
+
34
+ `main` is always green and releasable. All changes land via pull request.
35
+
36
+ 1. Branch off `main`. Name it by type:
37
+ - `feat/<short-name>` — new capability
38
+ - `fix/<short-name>` — bug fix
39
+ - `docs/<short-name>` — docs only
40
+ - `chore/<short-name>` — tooling, deps, CI, refactors
41
+ 2. Make focused commits using [Conventional Commits](https://www.conventionalcommits.org/):
42
+ `feat: …`, `fix: …`, `docs: …`, `chore: …`, `refactor: …`, `test: …`.
43
+ 3. Keep the change scoped — one logical thing per PR.
44
+ 4. Make sure `make lint` and `make test` pass locally.
45
+ 5. Open a PR into `main`. CI (lint + tests on Python 3.11–3.13) must pass.
46
+ 6. PRs are **squash-merged** — your PR becomes one clean commit on `main`.
47
+ 7. Note user-facing changes under `## [Unreleased]` in [CHANGELOG.md](CHANGELOG.md).
48
+
49
+ ## Project layout
50
+
51
+ - `skills/proofrag/SKILL.md` — the Agent Skill (the interface agents load)
52
+ - `src/proofrag/` — the engine: `corpus`, `goldenset`, `judge`, `metrics`,
53
+ `embeddings`, `diffing`, `scorecard`, `demo`, `llm`, `cli`
54
+ - `examples/docs-rag/` — a runnable end-to-end example
55
+ - `.claude-plugin/` — plugin + marketplace manifests
56
+ - `tests/` — offline smoke tests
57
+
58
+ ## Releases
59
+
60
+ The maintainer cuts a [SemVer](https://semver.org/) tag and a GitHub Release from `main`;
61
+ that triggers the PyPI publish workflow. Versions are derived from git tags.
proofrag-0.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ansh Dawda
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,22 @@
1
+ .PHONY: default install lint test build upgrade clean
2
+
3
+ default: install lint test
4
+
5
+ install:
6
+ uv sync --all-extras
7
+
8
+ lint:
9
+ uv run python devtools/lint.py
10
+
11
+ test:
12
+ uv run pytest
13
+
14
+ build:
15
+ uv build
16
+
17
+ upgrade:
18
+ uv sync --upgrade --all-extras
19
+
20
+ clean:
21
+ -rm -rf dist/ build/ *.egg-info/ .pytest_cache/ .ruff_cache/ .coverage htmlcov/
22
+ -find . -type d -name __pycache__ -exec rm -rf {} +
@@ -0,0 +1,183 @@
1
+ Metadata-Version: 2.5
2
+ Name: proofrag
3
+ Version: 0.3.0
4
+ Summary: Point your agent at your docs and your RAG app; get a golden test set + an LLM-as-judge & retrieval scorecard, in one command.
5
+ Project-URL: Repository, https://github.com/unshDee/proofrag
6
+ Project-URL: Issues, https://github.com/unshDee/proofrag/issues
7
+ Author-email: Ansh Dawda <ansh.dawda@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: agent-skills,claude,codex,evaluation,llm,llm-as-judge,rag,retrieval
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: <4.0,>=3.11
22
+ Provides-Extra: anthropic
23
+ Requires-Dist: anthropic>=0.40; extra == 'anthropic'
24
+ Provides-Extra: openai
25
+ Requires-Dist: openai>=1.40; extra == 'openai'
26
+ Description-Content-Type: text/markdown
27
+
28
+ # proofrag
29
+
30
+ [![CI](https://github.com/unshDee/proofrag/actions/workflows/ci.yml/badge.svg)](https://github.com/unshDee/proofrag/actions/workflows/ci.yml)
31
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org)
32
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
33
+
34
+ **Point your agent at your docs and your RAG app. Get a golden test set, an
35
+ LLM-as-judge + retrieval scorecard, and a CI gate — in one command.**
36
+
37
+ Evaluation is the #1 unmet pain in production RAG/LLM work, and the hardest part
38
+ is building a good test set in the first place. `proofrag` generates one from
39
+ *your own corpus*, judges your system on it, and emits a shareable HTML scorecard.
40
+ It's an [Agent Skill](https://agentskills.io) (works in Claude Code, Codex, Cursor)
41
+ **and** a plain Python CLI — wrapping the eval loop, not reinventing the metrics.
42
+
43
+ <p align="center">
44
+ <img src="docs/demo.gif" alt="proofrag — generate a golden set, judge, and score in one loop" width="820">
45
+ </p>
46
+
47
+ <p align="center"><em>…and the scorecard it produces:</em></p>
48
+ <p align="center">
49
+ <img src="docs/scorecard.png" alt="RAG eval scorecard" width="760">
50
+ </p>
51
+
52
+ <p align="center"><em>Try it now — no API key needed:</em></p>
53
+
54
+ ```bash
55
+ git clone https://github.com/unshDee/proofrag && cd proofrag
56
+ uv run proofrag demo --out scorecard.html && open scorecard.html
57
+ ```
58
+
59
+ > Uses [uv](https://docs.astral.sh/uv/). `uv run` auto-creates the environment on
60
+ > first call — nothing else to install. Prefer pip? `pipx install proofrag`.
61
+
62
+ ## Install as an Agent Skill
63
+
64
+ `proofrag` is a skill (the [agentskills.io](https://agentskills.io) open standard) backed
65
+ by a real CLI — so any agent can run *"evaluate my RAG"* and get a reproducible scorecard.
66
+
67
+ **Claude Code (plugin):**
68
+ ```
69
+ /plugin marketplace add unshDee/proofrag
70
+ /plugin install proofrag@proofrag
71
+ ```
72
+ Then ask *"evaluate my RAG"* (auto-triggered) or type `/proofrag`.
73
+
74
+ **Claude Code (manual)** — `cp -r skills/proofrag ~/.claude/skills/`
75
+ **Codex / other agents** — `cp -r skills/proofrag .agents/skills/`
76
+
77
+ The skill drives the `proofrag` CLI; install it with `uv tool install "proofrag[anthropic]"`
78
+ (or `pipx install`, or run ad-hoc via `uvx`). See [AGENTS.md](AGENTS.md) for details.
79
+
80
+ ## Why this exists
81
+
82
+ > "Running evals aren't the problem — the problem is acquiring or building a
83
+ > high-quality, non-contaminated dataset."
84
+
85
+ Most RAG systems reach production with no evals because writing a balanced golden
86
+ set by hand is tedious. So teams ship prompt and model changes blind. This closes
87
+ that loop: **change something → re-run → see if quality moved → gate the merge.**
88
+
89
+ ## The loop
90
+
91
+ ```bash
92
+ # 1. Generate a golden set from YOUR docs (questions + gold answers + gold contexts)
93
+ proofrag generate --corpus ./docs --out goldenset.jsonl --n 20
94
+
95
+ # 2. Run your RAG over each question -> predictions.jsonl (one line per question)
96
+ # {"id": "q000", "answer": "...", "retrieved_contexts": ["...", "..."]}
97
+ # See examples/docs-rag/naive_rag.py for a runnable driver.
98
+
99
+ # 3. Judge: groundedness, correctness, completeness, citation quality + retrieval metrics
100
+ proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl --out results.json
101
+
102
+ # 4. Shareable HTML scorecard
103
+ proofrag report --results results.json --out scorecard.html
104
+ ```
105
+
106
+ Run the whole thing end-to-end against the bundled example:
107
+
108
+ ```bash
109
+ uv sync --extra anthropic && export ANTHROPIC_API_KEY=...
110
+ uv run proofrag generate --corpus examples/docs-rag/corpus --out goldenset.jsonl --n 8
111
+ uv run python examples/docs-rag/naive_rag.py --goldenset goldenset.jsonl --corpus examples/docs-rag/corpus --out predictions.jsonl
112
+ uv run proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl --out results.json
113
+ uv run proofrag report --results results.json --out scorecard.html
114
+ ```
115
+
116
+ ## CI gate
117
+
118
+ Two kinds of gate. An **absolute** floor:
119
+
120
+ ```bash
121
+ proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl \
122
+ --out results.json --fail-under 0.7 # non-zero exit if overall score drops below 0.7
123
+ ```
124
+
125
+ …and a **regression** gate against a committed baseline (a known-good results.json):
126
+
127
+ ```bash
128
+ proofrag diff --baseline baseline.json --candidate results.json --tolerance 0.02
129
+ # prints a per-metric delta table; exits 1 if any metric dropped by more than the tolerance.
130
+ # Refuses to compare across different judge models unless --allow-judge-mismatch.
131
+ ```
132
+
133
+ ### GitHub Action
134
+
135
+ Drop proofrag into any repo's CI in a few lines — it installs the CLI, evaluates,
136
+ writes the scorecard, and gates on both the floor and the baseline:
137
+
138
+ ```yaml
139
+ - uses: unshDee/proofrag@v0
140
+ env:
141
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
142
+ with:
143
+ goldenset: eval/goldenset.jsonl
144
+ predictions: predictions.jsonl # produced by your RAG earlier in the job
145
+ baseline: eval/baseline.json # optional regression gate
146
+ fail-under: "0.7" # optional absolute gate
147
+ ```
148
+
149
+ Full runnable workflow (with artifact upload): [`examples/ci/proofrag-eval.yml`](examples/ci/proofrag-eval.yml).
150
+
151
+ ## What makes it different
152
+
153
+ - **Golden set from your corpus** — the wedge. Difficulty tiers: single-doc,
154
+ multi-doc, and *unanswerable* (so you catch hallucination-instead-of-refusal).
155
+ - **Retriever vs generator split** — rank-aware retrieval metrics (Recall@k,
156
+ Precision@k, NDCG@k, MRR) separate "the context never arrived / ranked too low"
157
+ from "the model fluffed it." Lexical by default; `--semantic` for embedding match.
158
+ - **Pinned, fingerprinted judge** — every scorecard records its judge model, so you
159
+ never compare scores produced by different judges.
160
+ - **Cheap & portable** — defaults to a small model; Anthropic, OpenAI, or local/Ollama
161
+ (`OPENAI_BASE_URL`). Self-contained HTML, zero JS, zero external assets.
162
+ - **Agent-native** — drop it in as a skill and say *"evaluate my RAG"*; the agent
163
+ wires your pipeline to the kit.
164
+
165
+ ## Configuration
166
+
167
+ | Env | Default | Purpose |
168
+ |-----|---------|---------|
169
+ | `ANTHROPIC_API_KEY` | — | Anthropic backend (default) |
170
+ | `OPENAI_API_KEY` / `OPENAI_BASE_URL` | — | OpenAI-compatible / local |
171
+ | `PROOFRAG_PROVIDER` | auto | `anthropic` or `openai` |
172
+ | `PROOFRAG_MODEL` | Haiku / gpt-4o-mini | judge & generator model |
173
+ | `PROOFRAG_EMBED_MODEL` | text-embedding-3-small | embeddings for `--semantic` retrieval match |
174
+
175
+ ## Roadmap
176
+
177
+ - [x] v0.1 — golden-set generator, LLM-as-judge, retrieval recall, HTML scorecard, CI gate
178
+ - [x] v0.2 — rank-aware retrieval metrics (Recall@k / Precision@k / NDCG@k / MRR), lexical + optional embedding match
179
+ - [x] v0.3 — GitHub Action + baseline diffing (regression-aware gate)
180
+ - [ ] v0.4 — A/B comparator (vector vs GraphRAG) with blind judging
181
+ - [ ] v0.5 — Ragas / DeepEval backends as pluggable scorers
182
+
183
+ Issues and PRs welcome. MIT licensed.