ragverdict 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragverdict-0.2.1/.gitignore +42 -0
- ragverdict-0.2.1/LICENSE +21 -0
- ragverdict-0.2.1/PKG-INFO +303 -0
- ragverdict-0.2.1/README.md +269 -0
- ragverdict-0.2.1/examples/README.md +63 -0
- ragverdict-0.2.1/examples/comparison/README.md +98 -0
- ragverdict-0.2.1/examples/langchain_rag/README.md +64 -0
- ragverdict-0.2.1/examples/openai_rag/README.md +47 -0
- ragverdict-0.2.1/pyproject.toml +72 -0
- ragverdict-0.2.1/src/ragverdict/__init__.py +30 -0
- ragverdict-0.2.1/src/ragverdict/adapters/__init__.py +23 -0
- ragverdict-0.2.1/src/ragverdict/adapters/base.py +92 -0
- ragverdict-0.2.1/src/ragverdict/adapters/http.py +65 -0
- ragverdict-0.2.1/src/ragverdict/adapters/loader.py +53 -0
- ragverdict-0.2.1/src/ragverdict/cli.py +58 -0
- ragverdict-0.2.1/src/ragverdict/config.py +117 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/__init__.py +27 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/base.py +47 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/citation_audit.py +207 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/edge_cases.py +460 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/rag_quality.py +318 -0
- ragverdict-0.2.1/src/ragverdict/evaluators/tool_coverage.py +130 -0
- ragverdict-0.2.1/src/ragverdict/judges/__init__.py +1 -0
- ragverdict-0.2.1/src/ragverdict/judges/llm_judge.py +324 -0
- ragverdict-0.2.1/src/ragverdict/py.typed +0 -0
- ragverdict-0.2.1/src/ragverdict/report.py +140 -0
- ragverdict-0.2.1/src/ragverdict/runner.py +107 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.Python
|
|
6
|
+
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
env/
|
|
10
|
+
ENV/
|
|
11
|
+
|
|
12
|
+
*.egg-info/
|
|
13
|
+
dist/
|
|
14
|
+
build/
|
|
15
|
+
*.egg
|
|
16
|
+
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
.coverage.*
|
|
22
|
+
htmlcov/
|
|
23
|
+
coverage.xml
|
|
24
|
+
|
|
25
|
+
.idea/
|
|
26
|
+
.vscode/
|
|
27
|
+
*.swp
|
|
28
|
+
*.swo
|
|
29
|
+
.DS_Store
|
|
30
|
+
|
|
31
|
+
report/
|
|
32
|
+
report.json
|
|
33
|
+
report.md
|
|
34
|
+
|
|
35
|
+
.env
|
|
36
|
+
.env.local
|
|
37
|
+
|
|
38
|
+
# Personal notes — never published
|
|
39
|
+
INTERVIEW_PREP.md
|
|
40
|
+
NOTES.md
|
|
41
|
+
RESUME_POINTERS.md
|
|
42
|
+
_private/
|
ragverdict-0.2.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shaurya Gulati
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragverdict
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: pytest for RAG agents — behavioral audits with PASS/FAIL/WEAK verdicts
|
|
5
|
+
Project-URL: Homepage, https://github.com/Shauryagulati/ragverdict
|
|
6
|
+
Project-URL: Repository, https://github.com/Shauryagulati/ragverdict
|
|
7
|
+
Author: Shaurya Gulati
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: anthropic,claude,evaluation,llm,rag,testing
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: anthropic>=0.40
|
|
22
|
+
Requires-Dist: click>=8.1
|
|
23
|
+
Requires-Dist: httpx>=0.27
|
|
24
|
+
Requires-Dist: pydantic>=2.6
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: rich>=13.7
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
32
|
+
Requires-Dist: types-pyyaml; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# ragverdict
|
|
36
|
+
|
|
37
|
+
[CI status](https://github.com/Shauryagulati/ragverdict/actions/workflows/ci.yml)
|
|
38
|
+
[Python 3.10+](https://github.com/Shauryagulati/ragverdict/blob/main/pyproject.toml)
|
|
39
|
+
[License: MIT](./LICENSE)
|
|
40
|
+
|
|
41
|
+
**pytest for RAG agents.** Behavioral audits of any RAG system — tool coverage, retrieval
|
|
42
|
+
quality, citation verification, hallucination guardrails — with PASS / FAIL / WEAK verdicts,
|
|
43
|
+
not floating-point metric averages.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
47
|
+
┃ Test ┃ Evaluator ┃ Verdict ┃ Latency ┃ Detail ┃
|
|
48
|
+
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
|
49
|
+
│ tool_coverage_all │ tool_coverage │ PASS │ 8ms │ 2/2 tools fired cleanly │
|
|
50
|
+
│ direct_retrieval │ rag_quality │ PASS │ 12662ms │ 3/3 cases passed │
|
|
51
|
+
│ hallucination_g… │ rag_quality │ PASS │ 3985ms │ 2/2 cases passed │
|
|
52
|
+
│ citation_audit │ citation_audit │ PASS │ 5535ms │ mean support_score=1.00 │
|
|
53
|
+
│ edge_cases_battery │ edge_cases │ PASS │ 2380ms │ 4/4 cases passed │
|
|
54
|
+
└────────────────────┴────────────────┴─────────┴─────────┴─────────────────────────────────┘
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Why ragverdict
|
|
58
|
+
|
|
59
|
+
Existing RAG evaluation tools score metrics. RAGAs, DeepEval, TruLens, and Arize Phoenix
|
|
60
|
+
all answer "how faithful was the response *on average*" via LLM-as-judge — they tell you
|
|
61
|
+
the mean of a fleet of scores. They do not answer **does the agent actually work
|
|
62
|
+
end-to-end**.
|
|
63
|
+
|
|
64
|
+
| Tool | Does | Doesn't |
|
|
65
|
+
|------------|---------------------------------------------------------------|-------------------------------------------------------------------------------|
|
|
66
|
+
| **RAGAs** | LLM-as-judge metric scores (faithfulness, context P/R) | No tool-call testing, no citation-vs-corpus verification, no assertions |
|
|
67
|
+
| **DeepEval** | pytest-style assertions on the RAGAs metric family | Same metric-centric model |
|
|
68
|
+
| **TruLens** | RAG Triad + OpenTelemetry tracing | Observability-centric |
|
|
69
|
+
| **Phoenix** | Tracing platform that wraps the above | Heavy infra, not a CLI |
|
|
70
|
+
|
|
71
|
+
**The gap ragverdict fills:** behavioral audits of RAG *agents* — assertions about whether
|
|
72
|
+
the system *behaves correctly*, with PASS/FAIL/WEAK verdicts that map cleanly to CI.
|
|
73
|
+
|
|
74
|
+
### What it checks
|
|
75
|
+
|
|
76
|
+
- **`tool_coverage`** — Fires every tool the agent exposes and confirms it returns without
|
|
77
|
+
error. Reports per-tool pass/fail + latency. **None of the four competitors do this.**
|
|
78
|
+
- **`rag_quality`** — Hard assertions (`must_mention`, `must_not_cite`, `must_refuse`,
|
|
79
|
+
`expects_citations`) plus LLM-as-judge faithfulness + relevance scoring for `WEAK` /
|
|
80
|
+
`FAIL` verdicts when hard checks pass.
|
|
81
|
+
- **`citation_audit`** — Verifies every `[src:ID]` citation resolves to a real document in
|
|
82
|
+
the agent's corpus, then asks the judge whether the cited claim is actually supported by
|
|
83
|
+
the source. Dangling citations are a hard `FAIL`.
|
|
84
|
+
- **`edge_cases`** *(v0.2)* — Input-boundary failure modes: `long_input` (10K-char
|
|
85
|
+
prompts, timeout-bounded), `multi_turn` (conversation-context recall), `contradiction`
|
|
86
|
+
(false premises must be pushed back on, judge-graded with `--no-judge` heuristic
|
|
87
|
+
fallback), and `empty_input` (clean rejection). **None of the four competitors do this
|
|
88
|
+
either.**
|
|
89
|
+
|
|
90
|
+
## Quickstart
|
|
91
|
+
|
|
92
|
+
### 1. Install
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pip install -e ".[dev]" # from a clone; PyPI release pending
|
|
96
|
+
export ANTHROPIC_API_KEY=sk-ant-… # required for the LLM judge
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 2. Run the bundled demo
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
ragverdict run examples/demo_rag/config.yaml
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
This runs five tests against a tiny reference RAG agent (`DemoAdapter`) over a fictional
|
|
106
|
+
"Acme Corp" corpus, exercising all four evaluators.
|
|
107
|
+
|
|
108
|
+
To run without burning API tokens:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
ragverdict run examples/demo_rag/config.yaml --no-judge
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
(Hard assertions still run; WEAK verdicts and citation support scoring are skipped.)
|
|
115
|
+
|
|
116
|
+
### 3. Write a config for your own RAG system
|
|
117
|
+
|
|
118
|
+
`config.yaml`:
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
adapter:
|
|
122
|
+
type: python
|
|
123
|
+
module: my_app.rag_adapter
|
|
124
|
+
class: MyRagAdapter
|
|
125
|
+
|
|
126
|
+
judge:
|
|
127
|
+
provider: anthropic
|
|
128
|
+
model: claude-sonnet-4-6
|
|
129
|
+
|
|
130
|
+
tests:
|
|
131
|
+
- name: tool_coverage_all
|
|
132
|
+
evaluator: tool_coverage
|
|
133
|
+
|
|
134
|
+
- name: golden_path
|
|
135
|
+
evaluator: rag_quality
|
|
136
|
+
cases:
|
|
137
|
+
- query: "What was Q1 2025 revenue?"
|
|
138
|
+
must_mention: ["$5.2M"]
|
|
139
|
+
expects_citations: true
|
|
140
|
+
|
|
141
|
+
- name: out_of_corpus
|
|
142
|
+
evaluator: rag_quality
|
|
143
|
+
cases:
|
|
144
|
+
- query: "Predict 2030 revenue."
|
|
145
|
+
must_refuse: true
|
|
146
|
+
must_not_cite: true
|
|
147
|
+
|
|
148
|
+
- name: citations
|
|
149
|
+
evaluator: citation_audit
|
|
150
|
+
sample_queries:
|
|
151
|
+
- "Summarize Q1 2025 risks."
|
|
152
|
+
- "Who is the CTO?"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 4. Write your adapter
|
|
156
|
+
|
|
157
|
+
Subclass `RagAdapter` and implement `query()`:
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from ragverdict import RagAdapter, RagResponse, Citation, ToolCall, ToolSpec, SourceDoc
|
|
161
|
+
|
|
162
|
+
class MyRagAdapter(RagAdapter):
|
|
163
|
+
def query(self, prompt, *, conversation=None) -> RagResponse:
|
|
164
|
+
# Call your real RAG pipeline:
|
|
165
|
+
text, retrieved, citations, tool_calls = my_pipeline.run(prompt)
|
|
166
|
+
return RagResponse(
|
|
167
|
+
text=text,
|
|
168
|
+
citations=[Citation(id=c.id, source_id=c.source, span=c.span) for c in citations],
|
|
169
|
+
tool_calls=[ToolCall(name=t.name, args=t.args, latency_ms=t.ms) for t in tool_calls],
|
|
170
|
+
retrieved_context=retrieved,
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
def available_tools(self) -> list[ToolSpec]:
|
|
174
|
+
return [ToolSpec(name="search_kb", description="Knowledge-base lookup")]
|
|
175
|
+
|
|
176
|
+
def corpus(self):
|
|
177
|
+
for doc in my_pipeline.iter_docs():
|
|
178
|
+
yield SourceDoc(source_id=doc.id, content=doc.text, title=doc.title)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
The runner inserts the current working directory into `sys.path` before resolving your
|
|
182
|
+
`module:` import, so a project-local `my_app/` package just works.
|
|
183
|
+
|
|
184
|
+
See [`examples/demo_rag/adapter.py`](./examples/demo_rag/adapter.py) for a complete
|
|
185
|
+
reference adapter and [`examples/README.md`](./examples/README.md) for a walkthrough.
|
|
186
|
+
|
|
187
|
+
## Verdicts
|
|
188
|
+
|
|
189
|
+
- **PASS** — All hard assertions hold; judge scores (if configured) are at or above the
|
|
190
|
+
pass threshold (defaults: faithfulness 0.85, relevance 0.85, citation support 0.95).
|
|
191
|
+
- **WEAK** — Hard assertions hold but a judge score falls in `[weak, pass)` (defaults:
|
|
192
|
+
0.7–0.85 for faithfulness/relevance, 0.8–0.95 for citation support).
|
|
193
|
+
- **FAIL** — A hard assertion failed, or a judge score fell below the weak threshold.
|
|
194
|
+
- **ERROR** — The evaluator crashed or the judge returned unparseable output.
|
|
195
|
+
|
|
196
|
+
Tune thresholds via the `thresholds:` section of `config.yaml`. Exit codes:
|
|
197
|
+
|
|
198
|
+
| Code | Meaning |
|
|
199
|
+
|------|--------------------------------------------------------|
|
|
200
|
+
| 0 | All tests PASS or WEAK |
|
|
201
|
+
| 1 | At least one FAIL or ERROR |
|
|
202
|
+
| 2 | Config error / adapter load failure / unknown evaluator |
|
|
203
|
+
| 3 | All tests ERROR (typically: judge unavailable) |
|
|
204
|
+
|
|
205
|
+
## Reports
|
|
206
|
+
|
|
207
|
+
After each run, two files land in `./report/` (override with `--out-dir`):
|
|
208
|
+
|
|
209
|
+
- **`report.json`** — Machine-readable: full per-test verdicts, metrics, judge artifacts,
|
|
210
|
+
per-citation audit detail. Stable shape — see [`docs/json-report-schema.md`](./docs/json-report-schema.md).
|
|
211
|
+
- **`report.md`** — Human-readable summary table.
|
|
212
|
+
|
|
213
|
+
## FAQ
|
|
214
|
+
|
|
215
|
+
### When should I use ragverdict vs RAGAs / DeepEval / TruLens?
|
|
216
|
+
|
|
217
|
+
They're complementary, not competing. The metric-centric tools (RAGAs, ARES, TruLens,
|
|
218
|
+
Phoenix, DeepEval) score response quality dimensions like faithfulness and relevance —
|
|
219
|
+
useful for tracking quality over time. ragverdict tests *agent behavior* — did the tools
|
|
220
|
+
fire, do the citations resolve to real documents, did the agent push back on a false
|
|
221
|
+
premise, does it survive a 10K-character prompt. A mature RAG team uses both:
|
|
222
|
+
RAGAs-style scoring for quality tracking + ragverdict for behavioral regression in CI.
|
|
223
|
+
|
|
224
|
+
### Does it work without an API key?
|
|
225
|
+
|
|
226
|
+
Yes. Pass `--no-judge` (or leave `ANTHROPIC_API_KEY` unset and the runner degrades
|
|
227
|
+
automatically). Hard assertions still run — `tool_coverage`, citation-vs-corpus
|
|
228
|
+
dangling checks, `must_mention` / `must_refuse` / `must_not_cite`,
|
|
229
|
+
long-input/multi-turn/empty-input edge cases. The `contradiction` edge case falls back
|
|
230
|
+
to a narrow regex heuristic (`_PUSHBACK_HINTS`) with a clear caveat in the FAIL detail
|
|
231
|
+
when it can't confidently grade.
|
|
232
|
+
|
|
233
|
+
### Can I write my own evaluator?
|
|
234
|
+
|
|
235
|
+
Yes. Subclass `Evaluator`, set a class-level `name`, decorate with `@register`, and
|
|
236
|
+
implement `run(adapter, spec, *, judge, thresholds) -> TestResult`. Then `import` your
|
|
237
|
+
module before `ragverdict run` or add it to the package's autoload. The bundled
|
|
238
|
+
evaluators (`src/ragverdict/evaluators/`) are reference implementations.
|
|
239
|
+
|
|
240
|
+
### Can I use it with a RAG system written in another language?
|
|
241
|
+
|
|
242
|
+
Yes — use the `HttpAdapter`. Set `adapter.type: http` + an `endpoint` URL in your
|
|
243
|
+
config. The runner POSTs `{prompt, conversation}` and expects a JSON response matching
|
|
244
|
+
the `RagResponse` shape. Your Rust / Go / Node / TypeScript / etc. service just needs
|
|
245
|
+
to speak that protocol.
|
|
246
|
+
|
|
247
|
+
### What's the difference between `WEAK` and `FAIL`?
|
|
248
|
+
|
|
249
|
+
`FAIL` = a hard assertion failed (a required substring was missing, a citation didn't
|
|
250
|
+
resolve, an edge case crashed) or a judge score fell below the weak threshold. `WEAK` = all hard assertions held but a judge score
|
|
251
|
+
fell into the configurable weak band (default: faithfulness or relevance in `[0.7,
|
|
252
|
+
0.85)`). `WEAK` is "watch this," `FAIL` is "fix this." Both `PASS` and `WEAK` give
|
|
253
|
+
exit code 0; `FAIL` gives exit code 1.
|
|
254
|
+
|
|
255
|
+
### Why four-state verdicts instead of floating-point scores?
|
|
256
|
+
|
|
257
|
+
So they map cleanly to CI exit codes and a 5-second scan of the terminal table. Raw
|
|
258
|
+
judge scores still live in `report.json` for users who want them — but the headline
|
|
259
|
+
output is a verdict, not a number you have to threshold yourself. The pitch is "pytest
|
|
260
|
+
for RAG, not metrics for RAG."
|
|
261
|
+
|
|
262
|
+
### Can I use a model other than Claude for the judge?
|
|
263
|
+
|
|
264
|
+
The judge is configurable via `judge.model` in `config.yaml` (defaults to
|
|
265
|
+
`claude-sonnet-4-6`). Any current Anthropic model works out of the box. Other
|
|
266
|
+
providers require swapping `LLMJudge` for a sibling implementation — the runner
|
|
267
|
+
accepts any object that satisfies the judge interface.
|
|
268
|
+
|
|
269
|
+
### How do I integrate this into GitHub Actions?
|
|
270
|
+
|
|
271
|
+
```yaml
|
|
272
|
+
- name: RAG behavioral audit
|
|
273
|
+
run: |
|
|
274
|
+
pip install git+https://github.com/Shauryagulati/ragverdict.git # PyPI release pending
|
|
275
|
+
ragverdict run config.yaml --no-judge
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
The exit code propagates to CI naturally — `PASS`/`WEAK` is exit 0, any `FAIL` or `ERROR` is exit 1,
|
|
279
|
+
config errors are exit 2, all-`ERROR` (typically: judge unreachable) is exit 3. For
|
|
280
|
+
live-judge CI runs, set `ANTHROPIC_API_KEY` as a repo secret and drop the
|
|
281
|
+
`--no-judge` flag.
|
|
282
|
+
|
|
283
|
+
### Does prompt caching actually fire?
|
|
284
|
+
|
|
285
|
+
The wiring is correct on every judge rubric (`cache_control={"type": "ephemeral"}`),
|
|
286
|
+
but Sonnet 4.6's minimum cacheable prefix is 2048 tokens and current rubrics are
|
|
287
|
+
400-600 tokens. Caching activates as rubrics grow (more examples) or on models with
|
|
288
|
+
smaller minimums. Documented honestly in `LLMJudge`'s module docstring rather than
|
|
289
|
+
silently shipping a feature that doesn't fire yet.
|
|
290
|
+
|
|
291
|
+
## Roadmap
|
|
292
|
+
|
|
293
|
+
v0.2 shipped the edge-case battery. Next up:
|
|
294
|
+
|
|
295
|
+
- Write-tool safety evaluator (preview-only verification, version chain checks)
|
|
296
|
+
- `auth_negative` kind for the `edge_cases` evaluator (requires adapter ABC extension)
|
|
297
|
+
- Native `OpenAI` / `LangChain` adapters
|
|
298
|
+
- Concurrent test execution
|
|
299
|
+
- Hosted dashboard with regression tracking across runs
|
|
300
|
+
|
|
301
|
+
## License
|
|
302
|
+
|
|
303
|
+
MIT — see [LICENSE](./LICENSE).
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# ragverdict
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/Shauryagulati/ragverdict/actions/workflows/ci.yml)
|
|
4
|
+
[Python 3.10+](https://github.com/Shauryagulati/ragverdict/blob/main/pyproject.toml)
|
|
5
|
+
[License: MIT](./LICENSE)
|
|
6
|
+
|
|
7
|
+
**pytest for RAG agents.** Behavioral audits of any RAG system — tool coverage, retrieval
|
|
8
|
+
quality, citation verification, hallucination guardrails — with PASS / FAIL / WEAK verdicts,
|
|
9
|
+
not floating-point metric averages.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
13
|
+
┃ Test ┃ Evaluator ┃ Verdict ┃ Latency ┃ Detail ┃
|
|
14
|
+
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
|
15
|
+
│ tool_coverage_all │ tool_coverage │ PASS │ 8ms │ 2/2 tools fired cleanly │
|
|
16
|
+
│ direct_retrieval │ rag_quality │ PASS │ 12662ms │ 3/3 cases passed │
|
|
17
|
+
│ hallucination_g… │ rag_quality │ PASS │ 3985ms │ 2/2 cases passed │
|
|
18
|
+
│ citation_audit │ citation_audit │ PASS │ 5535ms │ mean support_score=1.00 │
|
|
19
|
+
│ edge_cases_battery │ edge_cases │ PASS │ 2380ms │ 4/4 cases passed │
|
|
20
|
+
└────────────────────┴────────────────┴─────────┴─────────┴─────────────────────────────────┘
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Why ragverdict
|
|
24
|
+
|
|
25
|
+
Existing RAG evaluation tools score metrics. RAGAs, DeepEval, TruLens, and Arize Phoenix
|
|
26
|
+
all answer "how faithful was the response *on average*" via LLM-as-judge — they tell you
|
|
27
|
+
the mean of a fleet of scores. They do not answer **does the agent actually work
|
|
28
|
+
end-to-end**.
|
|
29
|
+
|
|
30
|
+
| Tool | Does | Doesn't |
|
|
31
|
+
|------------|---------------------------------------------------------------|-------------------------------------------------------------------------------|
|
|
32
|
+
| **RAGAs** | LLM-as-judge metric scores (faithfulness, context P/R) | No tool-call testing, no citation-vs-corpus verification, no assertions |
|
|
33
|
+
| **DeepEval** | pytest-style assertions on the RAGAs metric family | Same metric-centric model |
|
|
34
|
+
| **TruLens** | RAG Triad + OpenTelemetry tracing | Observability-centric |
|
|
35
|
+
| **Phoenix** | Tracing platform that wraps the above | Heavy infra, not a CLI |
|
|
36
|
+
|
|
37
|
+
**The gap ragverdict fills:** behavioral audits of RAG *agents* — assertions about whether
|
|
38
|
+
the system *behaves correctly*, with PASS/FAIL/WEAK verdicts that map cleanly to CI.
|
|
39
|
+
|
|
40
|
+
### What it checks
|
|
41
|
+
|
|
42
|
+
- **`tool_coverage`** — Fires every tool the agent exposes and confirms it returns without
|
|
43
|
+
error. Reports per-tool pass/fail + latency. **None of the four competitors do this.**
|
|
44
|
+
- **`rag_quality`** — Hard assertions (`must_mention`, `must_not_cite`, `must_refuse`,
|
|
45
|
+
`expects_citations`) plus LLM-as-judge faithfulness + relevance scoring for `WEAK` /
|
|
46
|
+
`FAIL` verdicts when hard checks pass.
|
|
47
|
+
- **`citation_audit`** — Verifies every `[src:ID]` citation resolves to a real document in
|
|
48
|
+
the agent's corpus, then asks the judge whether the cited claim is actually supported by
|
|
49
|
+
the source. Dangling citations are a hard `FAIL`.
|
|
50
|
+
- **`edge_cases`** *(v0.2)* — Input-boundary failure modes: `long_input` (10K-char
|
|
51
|
+
prompts, timeout-bounded), `multi_turn` (conversation-context recall), `contradiction`
|
|
52
|
+
(false premises must be pushed back on, judge-graded with `--no-judge` heuristic
|
|
53
|
+
fallback), and `empty_input` (clean rejection). **None of the four competitors do this
|
|
54
|
+
either.**
|
|
55
|
+
|
|
56
|
+
## Quickstart
|
|
57
|
+
|
|
58
|
+
### 1. Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -e ".[dev]" # from a clone; PyPI release pending
|
|
62
|
+
export ANTHROPIC_API_KEY=sk-ant-… # required for the LLM judge
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 2. Run the bundled demo
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
ragverdict run examples/demo_rag/config.yaml
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
This runs five tests against a tiny reference RAG agent (`DemoAdapter`) over a fictional
|
|
72
|
+
"Acme Corp" corpus, exercising all four evaluators.
|
|
73
|
+
|
|
74
|
+
To run without burning API tokens:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
ragverdict run examples/demo_rag/config.yaml --no-judge
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
(Hard assertions still run; WEAK verdicts and citation support scoring are skipped.)
|
|
81
|
+
|
|
82
|
+
### 3. Write a config for your own RAG system
|
|
83
|
+
|
|
84
|
+
`config.yaml`:
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
adapter:
|
|
88
|
+
type: python
|
|
89
|
+
module: my_app.rag_adapter
|
|
90
|
+
class: MyRagAdapter
|
|
91
|
+
|
|
92
|
+
judge:
|
|
93
|
+
provider: anthropic
|
|
94
|
+
model: claude-sonnet-4-6
|
|
95
|
+
|
|
96
|
+
tests:
|
|
97
|
+
- name: tool_coverage_all
|
|
98
|
+
evaluator: tool_coverage
|
|
99
|
+
|
|
100
|
+
- name: golden_path
|
|
101
|
+
evaluator: rag_quality
|
|
102
|
+
cases:
|
|
103
|
+
- query: "What was Q1 2025 revenue?"
|
|
104
|
+
must_mention: ["$5.2M"]
|
|
105
|
+
expects_citations: true
|
|
106
|
+
|
|
107
|
+
- name: out_of_corpus
|
|
108
|
+
evaluator: rag_quality
|
|
109
|
+
cases:
|
|
110
|
+
- query: "Predict 2030 revenue."
|
|
111
|
+
must_refuse: true
|
|
112
|
+
must_not_cite: true
|
|
113
|
+
|
|
114
|
+
- name: citations
|
|
115
|
+
evaluator: citation_audit
|
|
116
|
+
sample_queries:
|
|
117
|
+
- "Summarize Q1 2025 risks."
|
|
118
|
+
- "Who is the CTO?"
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 4. Write your adapter
|
|
122
|
+
|
|
123
|
+
Subclass `RagAdapter` and implement `query()`:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from ragverdict import RagAdapter, RagResponse, Citation, ToolCall, ToolSpec, SourceDoc
|
|
127
|
+
|
|
128
|
+
class MyRagAdapter(RagAdapter):
|
|
129
|
+
def query(self, prompt, *, conversation=None) -> RagResponse:
|
|
130
|
+
# Call your real RAG pipeline:
|
|
131
|
+
text, retrieved, citations, tool_calls = my_pipeline.run(prompt)
|
|
132
|
+
return RagResponse(
|
|
133
|
+
text=text,
|
|
134
|
+
citations=[Citation(id=c.id, source_id=c.source, span=c.span) for c in citations],
|
|
135
|
+
tool_calls=[ToolCall(name=t.name, args=t.args, latency_ms=t.ms) for t in tool_calls],
|
|
136
|
+
retrieved_context=retrieved,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
def available_tools(self) -> list[ToolSpec]:
|
|
140
|
+
return [ToolSpec(name="search_kb", description="Knowledge-base lookup")]
|
|
141
|
+
|
|
142
|
+
def corpus(self):
|
|
143
|
+
for doc in my_pipeline.iter_docs():
|
|
144
|
+
yield SourceDoc(source_id=doc.id, content=doc.text, title=doc.title)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
The runner inserts the current working directory into `sys.path` before resolving your
|
|
148
|
+
`module:` import, so a project-local `my_app/` package just works.
|
|
149
|
+
|
|
150
|
+
See [`examples/demo_rag/adapter.py`](./examples/demo_rag/adapter.py) for a complete
|
|
151
|
+
reference adapter and [`examples/README.md`](./examples/README.md) for a walkthrough.
|
|
152
|
+
|
|
153
|
+
## Verdicts
|
|
154
|
+
|
|
155
|
+
- **PASS** — All hard assertions hold; judge scores (if configured) are at or above the
|
|
156
|
+
pass threshold (defaults: faithfulness 0.85, relevance 0.85, citation support 0.95).
|
|
157
|
+
- **WEAK** — Hard assertions hold but a judge score falls in `[weak, pass)` (defaults:
|
|
158
|
+
0.7–0.85 for faithfulness/relevance, 0.8–0.95 for citation support).
|
|
159
|
+
- **FAIL** — A hard assertion failed, or a judge score fell below the weak threshold.
|
|
160
|
+
- **ERROR** — The evaluator crashed or the judge returned unparseable output.
|
|
161
|
+
|
|
162
|
+
Tune thresholds via the `thresholds:` section of `config.yaml`. Exit codes:
|
|
163
|
+
|
|
164
|
+
| Code | Meaning |
|
|
165
|
+
|------|--------------------------------------------------------|
|
|
166
|
+
| 0 | All tests PASS or WEAK |
|
|
167
|
+
| 1 | At least one FAIL or ERROR |
|
|
168
|
+
| 2 | Config error / adapter load failure / unknown evaluator |
|
|
169
|
+
| 3 | All tests ERROR (typically: judge unavailable) |
|
|
170
|
+
|
|
171
|
+
## Reports
|
|
172
|
+
|
|
173
|
+
After each run, two files land in `./report/` (override with `--out-dir`):
|
|
174
|
+
|
|
175
|
+
- **`report.json`** — Machine-readable: full per-test verdicts, metrics, judge artifacts,
|
|
176
|
+
per-citation audit detail. Stable shape — see [`docs/json-report-schema.md`](./docs/json-report-schema.md).
|
|
177
|
+
- **`report.md`** — Human-readable summary table.
|
|
178
|
+
|
|
179
|
+
## FAQ
|
|
180
|
+
|
|
181
|
+
### When should I use ragverdict vs RAGAs / DeepEval / TruLens?
|
|
182
|
+
|
|
183
|
+
They're complementary, not competing. The metric-centric tools (RAGAs, ARES, TruLens,
|
|
184
|
+
Phoenix, DeepEval) score response quality dimensions like faithfulness and relevance —
|
|
185
|
+
useful for tracking quality over time. ragverdict tests *agent behavior* — did the tools
|
|
186
|
+
fire, do the citations resolve to real documents, did the agent push back on a false
|
|
187
|
+
premise, does it survive a 10K-character prompt. A mature RAG team uses both:
|
|
188
|
+
RAGAs-style scoring for quality tracking + ragverdict for behavioral regression in CI.
|
|
189
|
+
|
|
190
|
+
### Does it work without an API key?
|
|
191
|
+
|
|
192
|
+
Yes. Pass `--no-judge` (or leave `ANTHROPIC_API_KEY` unset and the runner degrades
|
|
193
|
+
automatically). Hard assertions still run — `tool_coverage`, citation-vs-corpus
|
|
194
|
+
dangling checks, `must_mention` / `must_refuse` / `must_not_cite`,
|
|
195
|
+
long-input/multi-turn/empty-input edge cases. The `contradiction` edge case falls back
|
|
196
|
+
to a narrow regex heuristic (`_PUSHBACK_HINTS`) with a clear caveat in the FAIL detail
|
|
197
|
+
when it can't confidently grade.
|
|
198
|
+
|
|
199
|
+
### Can I write my own evaluator?
|
|
200
|
+
|
|
201
|
+
Yes. Subclass `Evaluator`, set a class-level `name`, decorate with `@register`, and
|
|
202
|
+
implement `run(adapter, spec, *, judge, thresholds) -> TestResult`. Then `import` your
|
|
203
|
+
module before `ragverdict run` or add it to the package's autoload. The bundled
|
|
204
|
+
evaluators (`src/ragverdict/evaluators/`) are reference implementations.
|
|
205
|
+
|
|
206
|
+
### Can I use it with a RAG system written in another language?
|
|
207
|
+
|
|
208
|
+
Yes — use the `HttpAdapter`. Set `adapter.type: http` + an `endpoint` URL in your
|
|
209
|
+
config. The runner POSTs `{prompt, conversation}` and expects a JSON response matching
|
|
210
|
+
the `RagResponse` shape. Your Rust / Go / Node / TypeScript / etc. service just needs
|
|
211
|
+
to speak that protocol.
|
|
212
|
+
|
|
213
|
+
### What's the difference between `WEAK` and `FAIL`?
|
|
214
|
+
|
|
215
|
+
`FAIL` = a hard assertion failed (a required substring was missing, a citation didn't
|
|
216
|
+
resolve, an edge case crashed) or a judge score fell below the weak threshold. `WEAK` = all hard assertions held but a judge score
|
|
217
|
+
fell into the configurable weak band (default: faithfulness or relevance in `[0.7,
|
|
218
|
+
0.85)`). `WEAK` is "watch this," `FAIL` is "fix this." Both `PASS` and `WEAK` give
|
|
219
|
+
exit code 0; `FAIL` gives exit code 1.
|
|
220
|
+
|
|
221
|
+
### Why four-state verdicts instead of floating-point scores?
|
|
222
|
+
|
|
223
|
+
So they map cleanly to CI exit codes and a 5-second scan of the terminal table. Raw
|
|
224
|
+
judge scores still live in `report.json` for users who want them — but the headline
|
|
225
|
+
output is a verdict, not a number you have to threshold yourself. The pitch is "pytest
|
|
226
|
+
for RAG, not metrics for RAG."
|
|
227
|
+
|
|
228
|
+
### Can I use a model other than Claude for the judge?
|
|
229
|
+
|
|
230
|
+
The judge is configurable via `judge.model` in `config.yaml` (defaults to
|
|
231
|
+
`claude-sonnet-4-6`). Any current Anthropic model works out of the box. Other
|
|
232
|
+
providers require swapping `LLMJudge` for a sibling implementation — the runner
|
|
233
|
+
accepts any object that satisfies the judge interface.
|
|
234
|
+
|
|
235
|
+
### How do I integrate this into GitHub Actions?
|
|
236
|
+
|
|
237
|
+
```yaml
|
|
238
|
+
- name: RAG behavioral audit
|
|
239
|
+
run: |
|
|
240
|
+
pip install git+https://github.com/Shauryagulati/ragverdict.git # PyPI release pending
|
|
241
|
+
ragverdict run config.yaml --no-judge
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
The exit code propagates to CI naturally — `PASS`/`WEAK` is exit 0, any `FAIL` or `ERROR` is exit 1,
|
|
245
|
+
config errors are exit 2, all-`ERROR` (typically: judge unreachable) is exit 3. For
|
|
246
|
+
live-judge CI runs, set `ANTHROPIC_API_KEY` as a repo secret and drop the
|
|
247
|
+
`--no-judge` flag.
|
|
248
|
+
|
|
249
|
+
### Does prompt caching actually fire?
|
|
250
|
+
|
|
251
|
+
The wiring is correct on every judge rubric (`cache_control={"type": "ephemeral"}`),
|
|
252
|
+
but Sonnet 4.6's minimum cacheable prefix is 2048 tokens and current rubrics are
|
|
253
|
+
400-600 tokens. Caching activates as rubrics grow (more examples) or on models with
|
|
254
|
+
smaller minimums. Documented honestly in `LLMJudge`'s module docstring rather than
|
|
255
|
+
silently shipping a feature that doesn't fire yet.
|
|
256
|
+
|
|
257
|
+
## Roadmap
|
|
258
|
+
|
|
259
|
+
v0.2 shipped the edge-case battery. Next up:
|
|
260
|
+
|
|
261
|
+
- Write-tool safety evaluator (preview-only verification, version chain checks)
|
|
262
|
+
- `auth_negative` kind for the `edge_cases` evaluator (requires adapter ABC extension)
|
|
263
|
+
- Native `OpenAI` / `LangChain` adapters
|
|
264
|
+
- Concurrent test execution
|
|
265
|
+
- Hosted dashboard with regression tracking across runs
|
|
266
|
+
|
|
267
|
+
## License
|
|
268
|
+
|
|
269
|
+
MIT — see [LICENSE](./LICENSE).
|