proofrag 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proofrag-0.3.0/.claude-plugin/marketplace.json +20 -0
- proofrag-0.3.0/.claude-plugin/plugin.json +10 -0
- proofrag-0.3.0/.env.example +27 -0
- proofrag-0.3.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
- proofrag-0.3.0/.github/ISSUE_TEMPLATE/config.yml +1 -0
- proofrag-0.3.0/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
- proofrag-0.3.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
- proofrag-0.3.0/.github/workflows/ci.yml +29 -0
- proofrag-0.3.0/.github/workflows/publish.yml +26 -0
- proofrag-0.3.0/.gitignore +19 -0
- proofrag-0.3.0/.python-version +1 -0
- proofrag-0.3.0/AGENTS.md +36 -0
- proofrag-0.3.0/CHANGELOG.md +45 -0
- proofrag-0.3.0/CONTRIBUTING.md +61 -0
- proofrag-0.3.0/LICENSE +21 -0
- proofrag-0.3.0/Makefile +22 -0
- proofrag-0.3.0/PKG-INFO +183 -0
- proofrag-0.3.0/README.md +156 -0
- proofrag-0.3.0/action.yml +84 -0
- proofrag-0.3.0/commands/proofrag.md +21 -0
- proofrag-0.3.0/devtools/lint.py +27 -0
- proofrag-0.3.0/docs/demo.gif +0 -0
- proofrag-0.3.0/docs/demo.tape +39 -0
- proofrag-0.3.0/docs/scorecard.png +0 -0
- proofrag-0.3.0/examples/ci/proofrag-eval.yml +45 -0
- proofrag-0.3.0/examples/docs-rag/corpus/api.md +19 -0
- proofrag-0.3.0/examples/docs-rag/corpus/platform.md +18 -0
- proofrag-0.3.0/examples/docs-rag/naive_rag.py +73 -0
- proofrag-0.3.0/pyproject.toml +102 -0
- proofrag-0.3.0/skills/proofrag/SKILL.md +90 -0
- proofrag-0.3.0/src/proofrag/__init__.py +8 -0
- proofrag-0.3.0/src/proofrag/cli.py +187 -0
- proofrag-0.3.0/src/proofrag/corpus.py +59 -0
- proofrag-0.3.0/src/proofrag/demo.py +143 -0
- proofrag-0.3.0/src/proofrag/diffing.py +57 -0
- proofrag-0.3.0/src/proofrag/embeddings.py +53 -0
- proofrag-0.3.0/src/proofrag/goldenset.py +128 -0
- proofrag-0.3.0/src/proofrag/judge.py +142 -0
- proofrag-0.3.0/src/proofrag/llm.py +117 -0
- proofrag-0.3.0/src/proofrag/metrics.py +106 -0
- proofrag-0.3.0/src/proofrag/scorecard.py +218 -0
- proofrag-0.3.0/tests/test_smoke.py +106 -0
- proofrag-0.3.0/uv.lock +567 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "proofrag",
|
|
3
|
+
"owner": { "name": "Ansh Dawda", "url": "https://github.com/unshDee" },
|
|
4
|
+
"metadata": {
|
|
5
|
+
"description": "RAG/LLM evaluation skill — golden sets, LLM-as-judge, scorecards.",
|
|
6
|
+
"version": "0.1.0"
|
|
7
|
+
},
|
|
8
|
+
"plugins": [
|
|
9
|
+
{
|
|
10
|
+
"name": "proofrag",
|
|
11
|
+
"source": "./",
|
|
12
|
+
"description": "Evaluate a RAG/LLM app: golden set from your docs + LLM-as-judge + retrieval metrics + shareable scorecard + CI gate.",
|
|
13
|
+
"version": "0.1.0",
|
|
14
|
+
"author": { "name": "Ansh Dawda" },
|
|
15
|
+
"homepage": "https://github.com/unshDee/proofrag",
|
|
16
|
+
"license": "MIT",
|
|
17
|
+
"keywords": ["rag", "llm", "evaluation", "llm-as-judge", "skill"]
|
|
18
|
+
}
|
|
19
|
+
]
|
|
20
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "proofrag",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Evaluate a RAG/LLM app: generate a golden set from your docs, run LLM-as-judge + retrieval metrics, and produce a shareable HTML scorecard with a CI gate.",
|
|
5
|
+
"author": { "name": "Ansh Dawda", "email": "ansh.dawda@gmail.com" },
|
|
6
|
+
"homepage": "https://github.com/unshDee/proofrag",
|
|
7
|
+
"repository": "https://github.com/unshDee/proofrag",
|
|
8
|
+
"license": "MIT",
|
|
9
|
+
"keywords": ["rag", "llm", "evaluation", "llm-as-judge", "retrieval", "skill"]
|
|
10
|
+
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copy this file to `.env` and fill in your key: cp .env.example .env
|
|
2
|
+
# `.env` is gitignored — never commit real keys.
|
|
3
|
+
#
|
|
4
|
+
# proofrag does not auto-load .env. Load it before running, e.g.:
|
|
5
|
+
# set -a && source .env && set +a
|
|
6
|
+
# (or just `export` the vars yourself).
|
|
7
|
+
|
|
8
|
+
# --- Backend (pick ONE) ------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
# Anthropic — the default backend (cheap Haiku judge).
|
|
11
|
+
ANTHROPIC_API_KEY=
|
|
12
|
+
|
|
13
|
+
# OpenAI-compatible — also covers local servers (Ollama, vLLM, LM Studio)
|
|
14
|
+
# via OPENAI_BASE_URL. Needed for `evaluate --semantic` (embeddings).
|
|
15
|
+
# OPENAI_API_KEY=
|
|
16
|
+
# OPENAI_BASE_URL=http://localhost:11434/v1
|
|
17
|
+
|
|
18
|
+
# --- Optional overrides ------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
# Force a provider instead of auto-detecting from the keys above.
|
|
21
|
+
# PROOFRAG_PROVIDER=anthropic # or: openai
|
|
22
|
+
|
|
23
|
+
# Judge & generator model (defaults: Haiku for Anthropic, gpt-4o-mini for OpenAI).
|
|
24
|
+
# PROOFRAG_MODEL=
|
|
25
|
+
|
|
26
|
+
# Embedding model used by `evaluate --semantic`.
|
|
27
|
+
# PROOFRAG_EMBED_MODEL=text-embedding-3-small
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Something isn't working as expected
|
|
4
|
+
title: "bug: "
|
|
5
|
+
labels: bug
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**What happened**
|
|
9
|
+
<!-- A clear description of the bug. -->
|
|
10
|
+
|
|
11
|
+
**Steps to reproduce**
|
|
12
|
+
1.
|
|
13
|
+
2.
|
|
14
|
+
|
|
15
|
+
**Expected**
|
|
16
|
+
<!-- What you expected instead. -->
|
|
17
|
+
|
|
18
|
+
**Environment**
|
|
19
|
+
- proofrag version: <!-- `proofrag --version` -->
|
|
20
|
+
- Python:
|
|
21
|
+
- Backend: <!-- anthropic / openai / local -->
|
|
22
|
+
- OS:
|
|
23
|
+
|
|
24
|
+
**Logs / scorecard**
|
|
25
|
+
<!-- Paste error output, or attach the scorecard HTML/JSON if relevant. -->
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
blank_issues_enabled: true
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Suggest a capability or improvement
|
|
4
|
+
title: "feat: "
|
|
5
|
+
labels: enhancement
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**Problem**
|
|
9
|
+
<!-- What are you trying to do that proofrag doesn't support today? -->
|
|
10
|
+
|
|
11
|
+
**Proposed solution**
|
|
12
|
+
<!-- What would the command / metric / output look like? -->
|
|
13
|
+
|
|
14
|
+
**Alternatives considered**
|
|
15
|
+
<!-- Other tools or approaches, and why they fall short. -->
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
## Summary
|
|
2
|
+
|
|
3
|
+
<!-- What does this PR do, and why? -->
|
|
4
|
+
|
|
5
|
+
## Type
|
|
6
|
+
|
|
7
|
+
- [ ] feat — new capability
|
|
8
|
+
- [ ] fix — bug fix
|
|
9
|
+
- [ ] docs — documentation only
|
|
10
|
+
- [ ] chore / refactor / test
|
|
11
|
+
|
|
12
|
+
## Checklist
|
|
13
|
+
|
|
14
|
+
- [ ] `make lint` passes
|
|
15
|
+
- [ ] `make test` passes
|
|
16
|
+
- [ ] Updated `CHANGELOG.md` under `## [Unreleased]` (if user-facing)
|
|
17
|
+
- [ ] Updated docs / `SKILL.md` if behavior or commands changed
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
with:
|
|
17
|
+
fetch-depth: 0 # uv-dynamic-versioning needs tags/history
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v6
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
- name: Install
|
|
23
|
+
run: uv sync --all-extras
|
|
24
|
+
- name: Lint
|
|
25
|
+
run: uv run python devtools/lint.py
|
|
26
|
+
- name: Test
|
|
27
|
+
run: uv run pytest
|
|
28
|
+
- name: Demo scorecard renders without an API key
|
|
29
|
+
run: uv run proofrag demo --out /tmp/scorecard.html
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
# Publishes proofrag to PyPI when a GitHub Release is published.
|
|
4
|
+
# Uses PyPI Trusted Publishing (OIDC) — configure the publisher once at
|
|
5
|
+
# https://pypi.org/manage/project/proofrag/settings/publishing/ (no token needed).
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
release:
|
|
9
|
+
types: [published]
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
pypi:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write # required for trusted publishing
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
with:
|
|
20
|
+
fetch-depth: 0 # uv-dynamic-versioning derives the version from tags
|
|
21
|
+
- name: Install uv
|
|
22
|
+
uses: astral-sh/setup-uv@v6
|
|
23
|
+
- name: Build
|
|
24
|
+
run: uv build
|
|
25
|
+
- name: Publish to PyPI
|
|
26
|
+
run: uv publish
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
.venv/
|
|
7
|
+
venv/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.DS_Store
|
|
10
|
+
# eval artifacts (commit goldenset.jsonl deliberately, ignore the rest)
|
|
11
|
+
results.json
|
|
12
|
+
predictions.jsonl
|
|
13
|
+
scorecard.html
|
|
14
|
+
!docs/scorecard.png
|
|
15
|
+
# secrets — never commit
|
|
16
|
+
.env
|
|
17
|
+
.env.*
|
|
18
|
+
!.env.example
|
|
19
|
+
.venv/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
proofrag-0.3.0/AGENTS.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
This repo ships **proofrag** as a portable [Agent Skill](https://agentskills.io):
|
|
4
|
+
`skills/proofrag/SKILL.md`. The skill is the interface; the `proofrag` Python CLI
|
|
5
|
+
(`src/proofrag/`) is the engine it drives.
|
|
6
|
+
|
|
7
|
+
## Use it as a skill
|
|
8
|
+
|
|
9
|
+
**Claude Code (plugin):**
|
|
10
|
+
```
|
|
11
|
+
/plugin marketplace add unshDee/proofrag
|
|
12
|
+
/plugin install proofrag@proofrag
|
|
13
|
+
```
|
|
14
|
+
Then just ask: *"evaluate my RAG"* — Claude auto-loads the skill. Or type `/proofrag`.
|
|
15
|
+
|
|
16
|
+
**Claude Code (manual):** copy the skill folder where Claude discovers skills:
|
|
17
|
+
```
|
|
18
|
+
cp -r skills/proofrag ~/.claude/skills/ # personal
|
|
19
|
+
cp -r skills/proofrag .claude/skills/ # this project only
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Codex / other agents (open standard):** drop the skill into your agent's skills
|
|
23
|
+
directory (e.g. `.agents/skills/` or your tool's equivalent):
|
|
24
|
+
```
|
|
25
|
+
cp -r skills/proofrag .agents/skills/
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Install the engine
|
|
29
|
+
|
|
30
|
+
The skill calls the `proofrag` CLI. Install it once, or run ad-hoc with `uvx`:
|
|
31
|
+
```
|
|
32
|
+
uv tool install "proofrag[anthropic]" # or: pipx install "proofrag[anthropic]"
|
|
33
|
+
uvx "proofrag[anthropic]" demo # no install
|
|
34
|
+
```
|
|
35
|
+
Set `ANTHROPIC_API_KEY` (default Haiku) or `OPENAI_API_KEY` (`OPENAI_BASE_URL` for
|
|
36
|
+
local/Ollama). No key needed for `proofrag demo`.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres
|
|
5
|
+
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- `proofrag diff` — compare a run against a committed baseline results.json and
|
|
11
|
+
fail on regression (per-metric delta table, `--tolerance`, refuses to compare
|
|
12
|
+
across different judge models unless `--allow-judge-mismatch`).
|
|
13
|
+
- Reusable composite GitHub Action (`action.yml`): `uses: unshDee/proofrag@v0`
|
|
14
|
+
installs the CLI, evaluates, writes the scorecard, and gates on the absolute
|
|
15
|
+
floor and/or the baseline. Example workflow in `examples/ci/`.
|
|
16
|
+
|
|
17
|
+
## [0.2.0] - 2026-05-31
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
- Rank-aware retrieval metrics: Recall@k, Precision@k, NDCG@k, MRR, with a
|
|
21
|
+
pluggable relevance matcher (`metrics.py`).
|
|
22
|
+
- Optional embedding-based semantic matcher (`embeddings.py`); `evaluate --semantic`
|
|
23
|
+
and `--k` to set the cutoff.
|
|
24
|
+
- Scorecard split into Generation and Retrieval panels with an NDCG@k headline.
|
|
25
|
+
- Animated demo GIF of the full eval loop (`docs/demo.gif`, reproducible via
|
|
26
|
+
`docs/demo.tape`).
|
|
27
|
+
- Installable as a Claude Code plugin: `.claude-plugin/` manifests, `/proofrag`
|
|
28
|
+
slash command, `AGENTS.md`, and skill-discovery layout under `skills/proofrag/`.
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
- Unanswerable cases skip retrieval scoring so they don't skew the averages.
|
|
32
|
+
|
|
33
|
+
## [0.1.0] - 2026-05-31
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
- Golden-set generator from a corpus, with single-doc / multi-doc / unanswerable
|
|
37
|
+
difficulty tiers.
|
|
38
|
+
- LLM-as-judge scoring (groundedness, correctness, completeness, citation quality),
|
|
39
|
+
pinned and fingerprinted.
|
|
40
|
+
- Self-contained, shareable HTML scorecard, plus a keyless `demo` command.
|
|
41
|
+
- `--fail-under` CI gate; provider-agnostic backend (Anthropic / OpenAI / local).
|
|
42
|
+
|
|
43
|
+
[Unreleased]: https://github.com/unshDee/proofrag/compare/v0.2.0...HEAD
|
|
44
|
+
[0.2.0]: https://github.com/unshDee/proofrag/compare/v0.1.0...v0.2.0
|
|
45
|
+
[0.1.0]: https://github.com/unshDee/proofrag/releases/tag/v0.1.0
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Contributing to proofrag
|
|
2
|
+
|
|
3
|
+
Thanks for considering a contribution! proofrag is an Agent Skill + Python CLI for
|
|
4
|
+
evaluating RAG/LLM apps. This guide covers the dev setup and the workflow.
|
|
5
|
+
|
|
6
|
+
## Dev setup
|
|
7
|
+
|
|
8
|
+
Uses [uv](https://docs.astral.sh/uv/).
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
git clone https://github.com/unshDee/proofrag && cd proofrag
|
|
12
|
+
uv sync --all-extras # installs the package + both backends + dev tools
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Run the checks (CI runs exactly these):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
make test # or: uv run pytest
|
|
19
|
+
make lint # or: uv run python devtools/lint.py (ruff + codespell + basedpyright)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
No API key needed for tests — they're fully offline. For a live end-to-end run, copy
|
|
23
|
+
the env template and add a key, then load it:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
cp .env.example .env # then put your key in .env
|
|
27
|
+
set -a && source .env && set +a
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
`.env` is gitignored; never commit real keys.
|
|
31
|
+
|
|
32
|
+
## Workflow (GitHub Flow)
|
|
33
|
+
|
|
34
|
+
`main` is always green and releasable. All changes land via pull request.
|
|
35
|
+
|
|
36
|
+
1. Branch off `main`. Name it by type:
|
|
37
|
+
- `feat/<short-name>` — new capability
|
|
38
|
+
- `fix/<short-name>` — bug fix
|
|
39
|
+
- `docs/<short-name>` — docs only
|
|
40
|
+
- `chore/<short-name>` — tooling, deps, CI, refactors
|
|
41
|
+
2. Make focused commits using [Conventional Commits](https://www.conventionalcommits.org/):
|
|
42
|
+
`feat: …`, `fix: …`, `docs: …`, `chore: …`, `refactor: …`, `test: …`.
|
|
43
|
+
3. Keep the change scoped — one logical thing per PR.
|
|
44
|
+
4. Make sure `make lint` and `make test` pass locally.
|
|
45
|
+
5. Open a PR into `main`. CI (lint + tests on Python 3.11–3.13) must pass.
|
|
46
|
+
6. PRs are **squash-merged** — your PR becomes one clean commit on `main`.
|
|
47
|
+
7. Note user-facing changes under `## [Unreleased]` in [CHANGELOG.md](CHANGELOG.md).
|
|
48
|
+
|
|
49
|
+
## Project layout
|
|
50
|
+
|
|
51
|
+
- `skills/proofrag/SKILL.md` — the Agent Skill (the interface agents load)
|
|
52
|
+
- `src/proofrag/` — the engine: `corpus`, `goldenset`, `judge`, `metrics`,
|
|
53
|
+
`embeddings`, `scorecard`, `diffing`, `demo`, `llm`, `cli`
|
|
54
|
+
- `examples/docs-rag/` — a runnable end-to-end example
|
|
55
|
+
- `.claude-plugin/` — plugin + marketplace manifests
|
|
56
|
+
- `tests/` — offline smoke tests
|
|
57
|
+
|
|
58
|
+
## Releases
|
|
59
|
+
|
|
60
|
+
Maintainer cuts a [SemVer](https://semver.org/) tag and a GitHub Release from `main`;
|
|
61
|
+
that triggers the PyPI publish workflow. Versions are derived from git tags.
|
proofrag-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ansh Dawda
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
proofrag-0.3.0/Makefile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
.PHONY: default install lint test build upgrade clean
|
|
2
|
+
|
|
3
|
+
default: install lint test
|
|
4
|
+
|
|
5
|
+
install:
|
|
6
|
+
uv sync --all-extras
|
|
7
|
+
|
|
8
|
+
lint:
|
|
9
|
+
uv run python devtools/lint.py
|
|
10
|
+
|
|
11
|
+
test:
|
|
12
|
+
uv run pytest
|
|
13
|
+
|
|
14
|
+
build:
|
|
15
|
+
uv build
|
|
16
|
+
|
|
17
|
+
upgrade:
|
|
18
|
+
uv sync --upgrade --all-extras
|
|
19
|
+
|
|
20
|
+
clean:
|
|
21
|
+
-rm -rf dist/ build/ *.egg-info/ .pytest_cache/ .ruff_cache/ .coverage htmlcov/
|
|
22
|
+
-find . -type d -name __pycache__ -exec rm -rf {} +
|
proofrag-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.5
|
|
2
|
+
Name: proofrag
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Point your agent at your docs and your RAG app; get a golden test set + an LLM-as-judge & retrieval scorecard, in one command.
|
|
5
|
+
Project-URL: Repository, https://github.com/unshDee/proofrag
|
|
6
|
+
Project-URL: Issues, https://github.com/unshDee/proofrag/issues
|
|
7
|
+
Author-email: Ansh Dawda <ansh.dawda@gmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: agent-skills,claude,codex,evaluation,llm,llm-as-judge,rag,retrieval
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: <4.0,>=3.11
|
|
22
|
+
Provides-Extra: anthropic
|
|
23
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
24
|
+
Provides-Extra: openai
|
|
25
|
+
Requires-Dist: openai>=1.40; extra == 'openai'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# proofrag
|
|
29
|
+
|
|
30
|
+
[CI status](https://github.com/unshDee/proofrag/actions/workflows/ci.yml)
|
|
31
|
+
[Python 3.11+](https://www.python.org)
|
|
32
|
+
[License: MIT](LICENSE)
|
|
33
|
+
|
|
34
|
+
**Point your agent at your docs and your RAG app. Get a golden test set, an
|
|
35
|
+
LLM-as-judge + retrieval scorecard, and a CI gate — in one command.**
|
|
36
|
+
|
|
37
|
+
Evaluation is the #1 unmet pain in production RAG/LLM work, and the hardest part
|
|
38
|
+
is building a good test set in the first place. `proofrag` generates one from
|
|
39
|
+
*your own corpus*, judges your system on it, and emits a shareable HTML scorecard.
|
|
40
|
+
It's an [Agent Skill](https://agentskills.io) (works in Claude Code, Codex, Cursor)
|
|
41
|
+
**and** a plain Python CLI — wrapping the eval loop, not reinventing the metrics.
|
|
42
|
+
|
|
43
|
+
<p align="center">
|
|
44
|
+
<img src="docs/demo.gif" alt="proofrag — generate a golden set, judge, and score in one loop" width="820">
|
|
45
|
+
</p>
|
|
46
|
+
|
|
47
|
+
<p align="center"><em>…and the scorecard it produces:</em></p>
|
|
48
|
+
<p align="center">
|
|
49
|
+
<img src="docs/scorecard.png" alt="RAG eval scorecard" width="760">
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
<p align="center"><em>Try it now — no API key needed:</em></p>
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/unshDee/proofrag && cd proofrag
|
|
56
|
+
uv run proofrag demo --out scorecard.html && open scorecard.html
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
> Uses [uv](https://docs.astral.sh/uv/). `uv run` auto-creates the environment on
|
|
60
|
+
> first call — nothing else to install. Prefer pip? `pipx install proofrag`.
|
|
61
|
+
|
|
62
|
+
## Install as an Agent Skill
|
|
63
|
+
|
|
64
|
+
`proofrag` is a skill (the [agentskills.io](https://agentskills.io) open standard) backed
|
|
65
|
+
by a real CLI — so any agent can run *"evaluate my RAG"* and get a reproducible scorecard.
|
|
66
|
+
|
|
67
|
+
**Claude Code (plugin):**
|
|
68
|
+
```
|
|
69
|
+
/plugin marketplace add unshDee/proofrag
|
|
70
|
+
/plugin install proofrag@proofrag
|
|
71
|
+
```
|
|
72
|
+
Then ask *"evaluate my RAG"* (auto-triggered) or type `/proofrag`.
|
|
73
|
+
|
|
74
|
+
**Claude Code (manual)** — `cp -r skills/proofrag ~/.claude/skills/`
|
|
75
|
+
**Codex / other agents** — `cp -r skills/proofrag .agents/skills/`
|
|
76
|
+
|
|
77
|
+
The skill drives the `proofrag` CLI; install it with `uv tool install "proofrag[anthropic]"`
|
|
78
|
+
(or `pipx install`, or run ad-hoc via `uvx`). See [AGENTS.md](AGENTS.md) for details.
|
|
79
|
+
|
|
80
|
+
## Why this exists
|
|
81
|
+
|
|
82
|
+
> "Running evals aren't the problem — the problem is acquiring or building a
|
|
83
|
+
> high-quality, non-contaminated dataset."
|
|
84
|
+
|
|
85
|
+
Most RAG systems reach production with no evals because writing a balanced golden
|
|
86
|
+
set by hand is tedious. So teams ship prompt and model changes blind. This closes
|
|
87
|
+
that loop: **change something → re-run → see if quality moved → gate the merge.**
|
|
88
|
+
|
|
89
|
+
## The loop
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# 1. Generate a golden set from YOUR docs (questions + gold answers + gold contexts)
|
|
93
|
+
proofrag generate --corpus ./docs --out goldenset.jsonl --n 20
|
|
94
|
+
|
|
95
|
+
# 2. Run your RAG over each question -> predictions.jsonl (one line per question)
|
|
96
|
+
# {"id": "q000", "answer": "...", "retrieved_contexts": ["...", "..."]}
|
|
97
|
+
# See examples/docs-rag/naive_rag.py for a runnable driver.
|
|
98
|
+
|
|
99
|
+
# 3. Judge: groundedness, correctness, completeness, citation quality + retrieval metrics
|
|
100
|
+
proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl --out results.json
|
|
101
|
+
|
|
102
|
+
# 4. Shareable HTML scorecard
|
|
103
|
+
proofrag report --results results.json --out scorecard.html
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Run the whole thing end-to-end against the bundled example:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
uv sync --extra anthropic && export ANTHROPIC_API_KEY=...
|
|
110
|
+
uv run proofrag generate --corpus examples/docs-rag/corpus --out goldenset.jsonl --n 8
|
|
111
|
+
uv run python examples/docs-rag/naive_rag.py --goldenset goldenset.jsonl --corpus examples/docs-rag/corpus --out predictions.jsonl
|
|
112
|
+
uv run proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl --out results.json
|
|
113
|
+
uv run proofrag report --results results.json --out scorecard.html
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## CI gate
|
|
117
|
+
|
|
118
|
+
Two kinds of gate. An **absolute** floor:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
proofrag evaluate --goldenset goldenset.jsonl --predictions predictions.jsonl \
|
|
122
|
+
--out results.json --fail-under 0.7 # non-zero exit if overall score drops below 0.7
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
…and a **regression** gate against a committed baseline (a known-good results.json):
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
proofrag diff --baseline baseline.json --candidate results.json --tolerance 0.02
|
|
129
|
+
# prints a per-metric delta table; exits 1 if any metric dropped > tolerance.
|
|
130
|
+
# Refuses to compare across different judge models unless --allow-judge-mismatch.
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### GitHub Action
|
|
134
|
+
|
|
135
|
+
Drop proofrag into any repo's CI in a few lines — it installs the CLI, evaluates,
|
|
136
|
+
writes the scorecard, and gates on both the floor and the baseline:
|
|
137
|
+
|
|
138
|
+
```yaml
|
|
139
|
+
- uses: unshDee/proofrag@v0
|
|
140
|
+
env:
|
|
141
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
142
|
+
with:
|
|
143
|
+
goldenset: eval/goldenset.jsonl
|
|
144
|
+
predictions: predictions.jsonl # produced by your RAG earlier in the job
|
|
145
|
+
baseline: eval/baseline.json # optional regression gate
|
|
146
|
+
fail-under: "0.7" # optional absolute gate
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Full runnable workflow (with artifact upload): [`examples/ci/proofrag-eval.yml`](examples/ci/proofrag-eval.yml).
|
|
150
|
+
|
|
151
|
+
## What makes it different
|
|
152
|
+
|
|
153
|
+
- **Golden set from your corpus** — the wedge. Difficulty tiers: single-doc,
|
|
154
|
+
multi-doc, and *unanswerable* (so you catch hallucination-instead-of-refusal).
|
|
155
|
+
- **Retriever vs generator split** — rank-aware retrieval metrics (Recall@k,
|
|
156
|
+
Precision@k, NDCG@k, MRR) separate "the context never arrived / ranked too low"
|
|
157
|
+
from "the model fluffed it." Lexical by default; `--semantic` for embedding match.
|
|
158
|
+
- **Pinned, fingerprinted judge** — every scorecard records its judge model, so you
|
|
159
|
+
never compare scores produced by different judges.
|
|
160
|
+
- **Cheap & portable** — defaults to a small model; Anthropic, OpenAI, or local/Ollama
|
|
161
|
+
(`OPENAI_BASE_URL`). Self-contained HTML, zero JS, zero external assets.
|
|
162
|
+
- **Agent-native** — drop it in as a skill and say *"evaluate my RAG"*; the agent
|
|
163
|
+
wires your pipeline to the kit.
|
|
164
|
+
|
|
165
|
+
## Configuration
|
|
166
|
+
|
|
167
|
+
| Env | Default | Purpose |
|
|
168
|
+
|-----|---------|---------|
|
|
169
|
+
| `ANTHROPIC_API_KEY` | — | Anthropic backend (default) |
|
|
170
|
+
| `OPENAI_API_KEY` / `OPENAI_BASE_URL` | — | OpenAI-compatible / local |
|
|
171
|
+
| `PROOFRAG_PROVIDER` | auto | `anthropic` or `openai` |
|
|
172
|
+
| `PROOFRAG_MODEL` | Haiku / gpt-4o-mini | judge & generator model |
|
|
173
|
+
| `PROOFRAG_EMBED_MODEL` | text-embedding-3-small | embeddings for `--semantic` retrieval match |
|
|
174
|
+
|
|
175
|
+
## Roadmap
|
|
176
|
+
|
|
177
|
+
- [x] v0.1 — golden-set generator, LLM-as-judge, retrieval recall, HTML scorecard, CI gate
|
|
178
|
+
- [x] v0.2 — rank-aware retrieval metrics (Recall@k / Precision@k / NDCG@k / MRR), lexical + optional embedding match
|
|
179
|
+
- [x] v0.3 — GitHub Action + baseline diffing (regression-aware gate)
|
|
180
|
+
- [ ] v0.4 — A/B comparator (vector vs GraphRAG) with blind judging
|
|
181
|
+
- [ ] v0.5 — Ragas / DeepEval backends as pluggable scorers
|
|
182
|
+
|
|
183
|
+
Issues and PRs welcome. MIT licensed.
|