trieval-0.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trieval-0.0.1/PKG-INFO +103 -0
- trieval-0.0.1/README.md +86 -0
- trieval-0.0.1/pyproject.toml +48 -0
- trieval-0.0.1/setup.cfg +4 -0
- trieval-0.0.1/tests/test_evaluator.py +201 -0
- trieval-0.0.1/tests/test_metrics.py +101 -0
- trieval-0.0.1/tests/test_models.py +88 -0
- trieval-0.0.1/trieval/__init__.py +14 -0
- trieval-0.0.1/trieval/evaluator.py +60 -0
- trieval-0.0.1/trieval/graph.py +110 -0
- trieval-0.0.1/trieval/metrics/__init__.py +29 -0
- trieval-0.0.1/trieval/metrics/answer_relevance.py +34 -0
- trieval-0.0.1/trieval/metrics/answerability.py +34 -0
- trieval-0.0.1/trieval/metrics/base.py +15 -0
- trieval-0.0.1/trieval/metrics/context_relevance.py +34 -0
- trieval-0.0.1/trieval/metrics/context_support.py +40 -0
- trieval-0.0.1/trieval/metrics/faithfulness.py +35 -0
- trieval-0.0.1/trieval/metrics/self_containment.py +41 -0
- trieval-0.0.1/trieval/models.py +81 -0
- trieval-0.0.1/trieval.egg-info/PKG-INFO +103 -0
- trieval-0.0.1/trieval.egg-info/SOURCES.txt +22 -0
- trieval-0.0.1/trieval.egg-info/dependency_links.txt +1 -0
- trieval-0.0.1/trieval.egg-info/requires.txt +10 -0
- trieval-0.0.1/trieval.egg-info/top_level.txt +1 -0
trieval-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,103 @@
+Metadata-Version: 2.4
+Name: trieval
+Version: 0.0.1
+Summary: RAG evaluation with exactly 6 metrics. 3 variables, 6 relationships, nothing more.
+License: MIT
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: pydantic>=2.0
+Requires-Dist: pydantic-ai>=0.1.0
+Requires-Dist: langgraph>=0.2.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
+Requires-Dist: pytest-cov>=6.0; extra == "dev"
+Requires-Dist: ruff>=0.8; extra == "dev"
+Requires-Dist: mypy>=1.13; extra == "dev"
+
+# trieval
+
+RAG evaluation with exactly 6 metrics. Nothing more.
+
+Every RAG system has 3 variables: **Q** (Question), **C** (Context), **A** (Answer).
+
+3 variables → 6 pairwise relationships → 6 metrics.
+
+## The 6 Metrics
+
+| # | Metric | Notation | Evaluates |
+|---|--------|----------|-----------|
+| 1 | Context Relevance | C\|Q | Is the retrieved context relevant to the question? |
+| 2 | Faithfulness | A\|C | Does the answer stick to the context? |
+| 3 | Answer Relevance | A\|Q | Does the answer solve the user's question? |
+| 4 | Context Support | C\|A | Does the context fully support the answer? |
+| 5 | Answerability | Q\|C | Can this question be answered with this context? |
+| 6 | Self-Containment | Q\|A | Can someone understand the question from the answer? |
+
+## Install
+
+```bash
+pip install trieval
+```
+
+## Quick Start
+
+```python
+from trieval import Evaluator
+
+evaluator = Evaluator(model="openai:gpt-4o-mini")
+result = await evaluator.evaluate(
+    question="What is photosynthesis?",
+    context="Photosynthesis is the process by which plants convert sunlight into energy.",
+    answer="Photosynthesis is how plants make food from sunlight.",
+)
+
+print(result.overall_score)     # 0.0–1.0
+print(result.retrieval_score)   # avg of C|Q + Q|C
+print(result.generation_score)  # avg of A|C + A|Q
+print(result.diagnose())        # ["All metrics healthy"] or failure categories
+```
+
+## Failure Diagnosis
+
+When your RAG system fails, it's always one of these:
+
+- **Retrieval issues** — C|Q and Q|C scores are low (wrong context retrieved)
+- **Generation issues** — A|C and A|Q scores are low (bad answer generation)
+- **End-to-end mismatch** — A|C is fine but C|A is low (faithful but unsupported)
+
+## Architecture
+
+Built with [pydantic-ai](https://ai.pydantic.dev/) (LLM-based metric agents) and [LangGraph](https://langchain-ai.github.io/langgraph/) (evaluation workflow orchestration).
+
+```
+RAGInput(Q, C, A)
+        ↓
+Evaluator.evaluate()
+        ↓
+LangGraph: evaluate_metrics → diagnose_failures
+        ↓
+EvaluationResult (scores + diagnosis)
+```
+
+Each metric is a pydantic-ai `Agent` with a tailored system prompt. All 6 run concurrently via `asyncio.gather` inside the LangGraph evaluation node.
+
+## Development
+
+```bash
+uv sync --group dev
+pytest                             # run tests
+pytest --cov=trieval --cov-branch  # with coverage
+ruff check trieval/ tests/         # lint
+ruff format trieval/ tests/        # format
+mypy trieval/                      # type check
+```
+
+## Documentation
+
+- [API Reference](docs/api.md) — Full API for `Evaluator`, `EvaluationResult`, `MetricResult`, `RAGInput`, and individual metric functions
+- [Changelog](CHANGELOG.md) — Version history
+
+## License
+
+MIT
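The arithmetic behind the package's thesis is worth spelling out: 6 is the number of ordered pairs drawn from the 3 RAG variables, and the metric table above covers exactly those pairs. A throwaway check (not part of the package):

```python
# The 6 metrics are exactly the ordered pairs (X|Y) of the 3 RAG variables.
from itertools import permutations

pairs = list(permutations(["Q", "C", "A"], 2))
print(pairs)       # [('Q', 'C'), ('Q', 'A'), ('C', 'Q'), ('C', 'A'), ('A', 'Q'), ('A', 'C')]
print(len(pairs))  # 6
```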
trieval-0.0.1/README.md
ADDED
@@ -0,0 +1,86 @@
+# trieval
+
+RAG evaluation with exactly 6 metrics. Nothing more.
+
+Every RAG system has 3 variables: **Q** (Question), **C** (Context), **A** (Answer).
+
+3 variables → 6 pairwise relationships → 6 metrics.
+
+## The 6 Metrics
+
+| # | Metric | Notation | Evaluates |
+|---|--------|----------|-----------|
+| 1 | Context Relevance | C\|Q | Is the retrieved context relevant to the question? |
+| 2 | Faithfulness | A\|C | Does the answer stick to the context? |
+| 3 | Answer Relevance | A\|Q | Does the answer solve the user's question? |
+| 4 | Context Support | C\|A | Does the context fully support the answer? |
+| 5 | Answerability | Q\|C | Can this question be answered with this context? |
+| 6 | Self-Containment | Q\|A | Can someone understand the question from the answer? |
+
+## Install
+
+```bash
+pip install trieval
+```
+
+## Quick Start
+
+```python
+from trieval import Evaluator
+
+evaluator = Evaluator(model="openai:gpt-4o-mini")
+result = await evaluator.evaluate(
+    question="What is photosynthesis?",
+    context="Photosynthesis is the process by which plants convert sunlight into energy.",
+    answer="Photosynthesis is how plants make food from sunlight.",
+)
+
+print(result.overall_score)     # 0.0–1.0
+print(result.retrieval_score)   # avg of C|Q + Q|C
+print(result.generation_score)  # avg of A|C + A|Q
+print(result.diagnose())        # ["All metrics healthy"] or failure categories
+```
+
+## Failure Diagnosis
+
+When your RAG system fails, it's always one of these:
+
+- **Retrieval issues** — C|Q and Q|C scores are low (wrong context retrieved)
+- **Generation issues** — A|C and A|Q scores are low (bad answer generation)
+- **End-to-end mismatch** — A|C is fine but C|A is low (faithful but unsupported)
+
+## Architecture
+
+Built with [pydantic-ai](https://ai.pydantic.dev/) (LLM-based metric agents) and [LangGraph](https://langchain-ai.github.io/langgraph/) (evaluation workflow orchestration).
+
+```
+RAGInput(Q, C, A)
+        ↓
+Evaluator.evaluate()
+        ↓
+LangGraph: evaluate_metrics → diagnose_failures
+        ↓
+EvaluationResult (scores + diagnosis)
+```
+
+Each metric is a pydantic-ai `Agent` with a tailored system prompt. All 6 run concurrently via `asyncio.gather` inside the LangGraph evaluation node.
+
+## Development
+
+```bash
+uv sync --group dev
+pytest                             # run tests
+pytest --cov=trieval --cov-branch  # with coverage
+ruff check trieval/ tests/         # lint
+ruff format trieval/ tests/        # format
+mypy trieval/                      # type check
+```
+
+## Documentation
+
+- [API Reference](docs/api.md) — Full API for `Evaluator`, `EvaluationResult`, `MetricResult`, `RAGInput`, and individual metric functions
+- [Changelog](CHANGELOG.md) — Version history
+
+## License
+
+MIT
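The Quick Start snippet above uses a bare `await`, which only works in a REPL or notebook with a running event loop. Not part of the package: a minimal sketch of running the same call as a script (assumes `pip install trieval` and whatever credentials pydantic-ai needs for the chosen model, e.g. an OpenAI API key in the environment):

```python
# Illustrative only: a runnable entry point for the README's Quick Start snippet.
import asyncio

from trieval import Evaluator


async def main() -> None:
    evaluator = Evaluator(model="openai:gpt-4o-mini")
    result = await evaluator.evaluate(
        question="What is photosynthesis?",
        context="Photosynthesis is the process by which plants convert sunlight into energy.",
        answer="Photosynthesis is how plants make food from sunlight.",
    )
    print(f"overall={result.overall_score:.2f}")
    print(result.diagnose())


if __name__ == "__main__":
    asyncio.run(main())
```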
trieval-0.0.1/pyproject.toml
ADDED
@@ -0,0 +1,48 @@
+[project]
+name = "trieval"
+version = "0.0.1"
+description = "RAG evaluation with exactly 6 metrics. 3 variables, 6 relationships, nothing more."
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+dependencies = [
+    "pydantic>=2.0",
+    "pydantic-ai>=0.1.0",
+    "langgraph>=0.2.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.24",
+    "pytest-cov>=6.0",
+    "ruff>=0.8",
+    "mypy>=1.13",
+]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.24",
+    "pytest-cov>=6.0",
+    "ruff>=0.8",
+    "mypy>=1.13",
+]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "N", "UP", "ASYNC"]
+
+[tool.mypy]
+python_version = "3.11"
+strict = true
+
+[tool.coverage.run]
+source = ["trieval"]
+branch = true
trieval-0.0.1/setup.cfg
ADDED
trieval-0.0.1/tests/test_evaluator.py
ADDED
@@ -0,0 +1,201 @@
+"""Tests for the evaluation graph and evaluator."""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from trieval.evaluator import Evaluator
+from trieval.graph import EvalState, diagnose_failures, evaluate_metrics
+from trieval.models import MetricResult
+
+
+def _mock_metric(score: float = 0.9) -> MetricResult:
+    return MetricResult(score=score, reason="mock")
+
+
+@pytest.fixture
+def _patch_all_metrics():
+    """Patch all 6 metric evaluate functions to return mock results."""
+    modules = [
+        "trieval.metrics.context_relevance",
+        "trieval.metrics.faithfulness",
+        "trieval.metrics.answer_relevance",
+        "trieval.metrics.context_support",
+        "trieval.metrics.answerability",
+        "trieval.metrics.self_containment",
+    ]
+    mocks = {}
+    patches = []
+    for mod in modules:
+        p = patch(f"{mod}.evaluate", new_callable=AsyncMock, return_value=_mock_metric())
+        mocks[mod] = p.start()
+        patches.append(p)
+    yield mocks
+    for p in patches:
+        p.stop()
+
+
+@pytest.mark.usefixtures("_patch_all_metrics")
+async def test_evaluate_metrics_runs_all() -> None:
+    state: EvalState = {
+        "question": "What is X?",
+        "context": "X is Y.",
+        "answer": "X is Y.",
+        "model": "test",
+        "results": {},
+        "diagnosis": [],
+    }
+    result = await evaluate_metrics(state)
+    assert len(result["results"]) == 6
+    assert all(isinstance(v, MetricResult) for v in result["results"].values())
+
+
+def test_diagnose_healthy() -> None:
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.9),
+            "faithfulness": _mock_metric(0.9),
+            "answer_relevance": _mock_metric(0.9),
+            "context_support": _mock_metric(0.9),
+            "answerability": _mock_metric(0.9),
+            "self_containment": _mock_metric(0.9),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    assert result["diagnosis"] == ["All metrics healthy"]
+
+
+def test_diagnose_retrieval_failure() -> None:
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.1),
+            "faithfulness": _mock_metric(0.9),
+            "answer_relevance": _mock_metric(0.9),
+            "context_support": _mock_metric(0.9),
+            "answerability": _mock_metric(0.2),
+            "self_containment": _mock_metric(0.9),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    assert any("Retrieval issues" in d for d in result["diagnosis"])
+
+
+@pytest.mark.usefixtures("_patch_all_metrics")
+async def test_evaluator_returns_evaluation_result() -> None:
+    evaluator = Evaluator(model="test")
+    result = await evaluator.evaluate(
+        question="What is X?",
+        context="X is Y.",
+        answer="X is Y.",
+    )
+    assert result.overall_score == 0.9
+    assert result.diagnose() == ["All metrics healthy"]
+
+
+@pytest.mark.usefixtures("_patch_all_metrics")
+async def test_evaluator_accepts_list_context() -> None:
+    evaluator = Evaluator(model="test")
+    result = await evaluator.evaluate(
+        question="What is X?",
+        context=["chunk 1", "chunk 2"],
+        answer="X is Y.",
+    )
+    assert result.overall_score == 0.9
+
+
+def test_diagnose_self_containment_failure() -> None:
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.9),
+            "faithfulness": _mock_metric(0.9),
+            "answer_relevance": _mock_metric(0.9),
+            "context_support": _mock_metric(0.9),
+            "answerability": _mock_metric(0.9),
+            "self_containment": _mock_metric(0.2),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    assert any("Self-containment" in d for d in result["diagnosis"])
+
+
+def test_diagnose_multiple_issues() -> None:
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.1),
+            "faithfulness": _mock_metric(0.8),
+            "answer_relevance": _mock_metric(0.1),
+            "context_support": _mock_metric(0.9),
+            "answerability": _mock_metric(0.2),
+            "self_containment": _mock_metric(0.1),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    assert len(result["diagnosis"]) >= 3
+
+
+def test_diagnose_high_context_support_low_faithfulness() -> None:
+    """Test when context_support is high but faithfulness is low (no e2e mismatch)."""
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.9),
+            "faithfulness": _mock_metric(0.1),
+            "answer_relevance": _mock_metric(0.2),
+            "context_support": _mock_metric(0.9),
+            "answerability": _mock_metric(0.9),
+            "self_containment": _mock_metric(0.9),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    # Should have generation issue, not e2e mismatch (because faithfulness is low)
+    diagnosis_str = " ".join(result["diagnosis"])
+    assert "Generation issues" in diagnosis_str
+    assert "End-to-end mismatch" not in diagnosis_str
+
+
+def test_diagnose_e2e_mismatch_faithful_unsupported() -> None:
+    """Test e2e mismatch: answer is faithful but context doesn't support it."""
+    state: EvalState = {
+        "question": "",
+        "context": "",
+        "answer": "",
+        "model": "",
+        "results": {
+            "context_relevance": _mock_metric(0.9),
+            "faithfulness": _mock_metric(0.9),  # High: answer is faithful
+            "answer_relevance": _mock_metric(0.9),
+            "context_support": _mock_metric(0.2),  # Low: context doesn't support it
+            "answerability": _mock_metric(0.9),
+            "self_containment": _mock_metric(0.9),
+        },
+        "diagnosis": [],
+    }
+    result = diagnose_failures(state)
+    diagnosis_str = " ".join(result["diagnosis"])
+    assert "End-to-end mismatch" in diagnosis_str
trieval-0.0.1/tests/test_metrics.py
ADDED
@@ -0,0 +1,101 @@
+"""Tests for all 6 RAG evaluation metrics."""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from trieval.metrics import (
+    answer_relevance,
+    answerability,
+    context_relevance,
+    context_support,
+    faithfulness,
+    self_containment,
+)
+from trieval.models import MetricResult
+
+
+@pytest.mark.asyncio
+async def test_context_relevance_evaluate() -> None:
+    with patch.object(context_relevance, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.9, reason="relevant")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await context_relevance.evaluate("Q?", "context text", model="test")
+
+        assert result.score == 0.9
+        assert result.reason == "relevant"
+        mock_agent.run.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_evaluate() -> None:
+    with patch.object(faithfulness, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.85, reason="faithful")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await faithfulness.evaluate("answer text", "context text", model="test")
+
+        assert result.score == 0.85
+        assert result.reason == "faithful"
+        mock_agent.run.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_answer_relevance_evaluate() -> None:
+    with patch.object(answer_relevance, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.95, reason="highly relevant")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await answer_relevance.evaluate("answer text", "Q?", model="test")
+
+        assert result.score == 0.95
+        assert result.reason == "highly relevant"
+        mock_agent.run.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_context_support_evaluate() -> None:
+    with patch.object(context_support, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.8, reason="supported")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await context_support.evaluate("context text", "answer text", model="test")
+
+        assert result.score == 0.8
+        assert result.reason == "supported"
+        mock_agent.run.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_answerability_evaluate() -> None:
+    with patch.object(answerability, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.75, reason="answerable")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await answerability.evaluate("Q?", "context text", model="test")
+
+        assert result.score == 0.75
+        assert result.reason == "answerable"
+        mock_agent.run.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_self_containment_evaluate() -> None:
+    with patch.object(self_containment, "agent") as mock_agent:
+        mock_result = AsyncMock()
+        mock_result.output = MetricResult(score=0.7, reason="self-contained")
+        mock_agent.run = AsyncMock(return_value=mock_result)
+
+        result = await self_containment.evaluate("Q?", "answer text", model="test")
+
+        assert result.score == 0.7
+        assert result.reason == "self-contained"
+        mock_agent.run.assert_called_once()
trieval-0.0.1/tests/test_models.py
ADDED
@@ -0,0 +1,88 @@
+"""Tests for trieval.models."""
+
+from trieval.models import EvaluationResult, MetricResult, RAGInput
+
+
+def test_rag_input_string_context() -> None:
+    inp = RAGInput(question="Q?", context="some context", answer="A.")
+    assert inp.context_text == "some context"
+
+
+def test_rag_input_list_context() -> None:
+    inp = RAGInput(question="Q?", context=["chunk 1", "chunk 2"], answer="A.")
+    assert "chunk 1" in inp.context_text
+    assert "chunk 2" in inp.context_text
+    assert "---" in inp.context_text
+
+
+def _make_result(**overrides: float) -> EvaluationResult:
+    defaults = {
+        "context_relevance": 0.9,
+        "faithfulness": 0.8,
+        "answer_relevance": 0.85,
+        "context_support": 0.7,
+        "answerability": 0.75,
+        "self_containment": 0.6,
+    }
+    defaults.update(overrides)
+    return EvaluationResult(
+        **{k: MetricResult(score=v, reason="test") for k, v in defaults.items()}
+    )
+
+
+def test_retrieval_score() -> None:
+    result = _make_result(context_relevance=0.8, answerability=0.6)
+    assert result.retrieval_score == 0.7
+
+
+def test_generation_score() -> None:
+    result = _make_result(faithfulness=0.9, answer_relevance=0.7)
+    assert result.generation_score == 0.8
+
+
+def test_overall_score() -> None:
+    result = _make_result(
+        context_relevance=0.6,
+        faithfulness=0.6,
+        answer_relevance=0.6,
+        context_support=0.6,
+        answerability=0.6,
+        self_containment=0.6,
+    )
+    assert result.overall_score == 0.6
+
+
+def test_diagnose_healthy() -> None:
+    result = _make_result()
+    diagnosis = result.diagnose(threshold=0.5)
+    assert diagnosis == ["All metrics healthy"]
+
+
+def test_diagnose_retrieval_issues() -> None:
+    result = _make_result(context_relevance=0.2, answerability=0.3)
+    diagnosis = result.diagnose(threshold=0.5)
+    assert any("Retrieval issues" in d for d in diagnosis)
+
+
+def test_diagnose_generation_issues() -> None:
+    result = _make_result(faithfulness=0.1, answer_relevance=0.2)
+    diagnosis = result.diagnose(threshold=0.5)
+    assert any("Generation issues" in d for d in diagnosis)
+
+
+def test_diagnose_e2e_mismatch() -> None:
+    result = _make_result(faithfulness=0.8, context_support=0.3)
+    diagnosis = result.diagnose(threshold=0.5)
+    assert any("End-to-end mismatch" in d for d in diagnosis)
+
+
+def test_diagnose_self_containment() -> None:
+    result = _make_result(self_containment=0.2)
+    diagnosis = result.diagnose(threshold=0.5)
+    assert any("Self-containment" in d for d in diagnosis)
+
+
+def test_metric_result_validation() -> None:
+    m = MetricResult(score=0.5, reason="okay")
+    assert m.score == 0.5
+    assert m.reason == "okay"
trieval-0.0.1/trieval/__init__.py
ADDED
@@ -0,0 +1,14 @@
+"""trieval — RAG evaluation with exactly 6 metrics.
+
+3 variables (Q, C, A) → 6 relationships → 6 metrics. Nothing more.
+"""
+
+from trieval.evaluator import Evaluator
+from trieval.models import EvaluationResult, MetricResult, RAGInput
+
+__all__ = [
+    "Evaluator",
+    "EvaluationResult",
+    "MetricResult",
+    "RAGInput",
+]
trieval-0.0.1/trieval/evaluator.py
ADDED
@@ -0,0 +1,60 @@
+"""High-level evaluator API.
+
+Wraps the LangGraph evaluation workflow in a simple interface:
+
+    evaluator = Evaluator(model="openai:gpt-4o-mini")
+    result = await evaluator.evaluate(
+        question="What is photosynthesis?",
+        context="Photosynthesis is the process...",
+        answer="Plants convert sunlight into energy.",
+    )
+"""
+
+from __future__ import annotations
+
+from trieval.graph import compile_graph
+from trieval.models import EvaluationResult, RAGInput
+
+
+class Evaluator:
+    """RAG evaluation with exactly 6 metrics."""
+
+    def __init__(self, model: str = "openai:gpt-4o-mini") -> None:
+        self.model = model
+        self._graph = compile_graph()
+
+    async def evaluate(
+        self,
+        question: str,
+        context: str | list[str],
+        answer: str,
+    ) -> EvaluationResult:
+        """Evaluate a RAG triplet (Q, C, A) across all 6 metrics.
+
+        Args:
+            question: The user's question.
+            context: Retrieved context (string or list of chunks).
+            answer: The generated answer.
+
+        Returns:
+            EvaluationResult with scores, composite metrics, and diagnosis.
+        """
+        rag_input = RAGInput(question=question, context=context, answer=answer)
+
+        state = await self._graph.ainvoke({
+            "question": rag_input.question,
+            "context": rag_input.context_text,
+            "answer": rag_input.answer,
+            "model": self.model,
+            "results": {},
+            "diagnosis": [],
+        })
+
+        return EvaluationResult(
+            context_relevance=state["results"]["context_relevance"],
+            faithfulness=state["results"]["faithfulness"],
+            answer_relevance=state["results"]["answer_relevance"],
+            context_support=state["results"]["context_support"],
+            answerability=state["results"]["answerability"],
+            self_containment=state["results"]["self_containment"],
+        )
trieval-0.0.1/trieval/graph.py
ADDED
@@ -0,0 +1,110 @@
+"""LangGraph evaluation workflow.
+
+Orchestrates all 6 metric evaluations as a directed graph:
+
+    START → evaluate_metrics → diagnose_failures → END
+
+The evaluate node runs all 6 pydantic-ai agents concurrently.
+The diagnose node maps low scores to retrieval/generation/e2e categories.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any, TypedDict
+
+from langgraph.graph import END, START, StateGraph
+
+from trieval.metrics import (
+    answer_relevance,
+    answerability,
+    context_relevance,
+    context_support,
+    faithfulness,
+    self_containment,
+)
+from trieval.models import MetricResult
+
+
+class EvalState(TypedDict):
+    """State flowing through the evaluation graph."""
+
+    question: str
+    context: str
+    answer: str
+    model: str
+    results: dict[str, MetricResult]
+    diagnosis: list[str]
+
+
+async def evaluate_metrics(state: EvalState) -> dict[str, Any]:
+    """Run all 6 metric agents concurrently."""
+    q, c, a, model = state["question"], state["context"], state["answer"], state["model"]
+
+    cr, ff, ar, cs, qa, sc = await asyncio.gather(
+        context_relevance.evaluate(q, c, model=model),
+        faithfulness.evaluate(a, c, model=model),
+        answer_relevance.evaluate(a, q, model=model),
+        context_support.evaluate(c, a, model=model),
+        answerability.evaluate(q, c, model=model),
+        self_containment.evaluate(q, a, model=model),
+    )
+
+    return {
+        "results": {
+            "context_relevance": cr,
+            "faithfulness": ff,
+            "answer_relevance": ar,
+            "context_support": cs,
+            "answerability": qa,
+            "self_containment": sc,
+        }
+    }
+
+
+def diagnose_failures(state: EvalState) -> dict[str, Any]:
+    """Categorize failures into retrieval, generation, or end-to-end issues."""
+    results = state["results"]
+    threshold = 0.5
+    issues: list[str] = []
+
+    retrieval_score = (
+        results["context_relevance"].score + results["answerability"].score
+    ) / 2
+    generation_score = (
+        results["faithfulness"].score + results["answer_relevance"].score
+    ) / 2
+
+    if retrieval_score < threshold:
+        issues.append("Retrieval issues: context is not well-matched to the question")
+    if generation_score < threshold:
+        issues.append("Generation issues: answer quality is below threshold")
+    if (
+        results["context_support"].score < threshold
+        and results["faithfulness"].score >= threshold
+    ):
+        issues.append(
+            "End-to-end mismatch: answer is faithful but context doesn't fully support it"
+        )
+    if results["self_containment"].score < threshold:
+        issues.append("Self-containment issue: answer requires the question for understanding")
+    if not issues:
+        issues.append("All metrics healthy")
+
+    return {"diagnosis": issues}
+
+
+def build_graph() -> StateGraph:
+    """Build the evaluation graph."""
+    builder = StateGraph(EvalState)
+    builder.add_node("evaluate_metrics", evaluate_metrics)
+    builder.add_node("diagnose_failures", diagnose_failures)
+    builder.add_edge(START, "evaluate_metrics")
+    builder.add_edge("evaluate_metrics", "diagnose_failures")
+    builder.add_edge("diagnose_failures", END)
+    return builder
+
+
+def compile_graph() -> Any:
+    """Build and compile the evaluation graph, ready to invoke."""
+    return build_graph().compile()
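Not part of the package: for callers who want the raw state dict rather than an `EvaluationResult`, the compiled graph can be invoked directly. A sketch mirroring the `ainvoke` call in `evaluator.py` above (same model and credential assumptions as the Quick Start; the metric agents make live LLM calls unless patched as in the tests):

```python
# Illustrative only: drive the LangGraph workflow without the Evaluator wrapper.
import asyncio

from trieval.graph import compile_graph


async def main() -> None:
    graph = compile_graph()
    state = await graph.ainvoke({
        "question": "What is photosynthesis?",
        "context": "Photosynthesis is the process by which plants convert sunlight into energy.",
        "answer": "Photosynthesis is how plants make food from sunlight.",
        "model": "openai:gpt-4o-mini",
        "results": {},
        "diagnosis": [],
    })
    for name, metric in state["results"].items():
        print(f"{name}: {metric.score:.2f} ({metric.reason})")
    print(state["diagnosis"])


if __name__ == "__main__":
    asyncio.run(main())
```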
trieval-0.0.1/trieval/metrics/__init__.py
ADDED
@@ -0,0 +1,29 @@
+"""RAG evaluation metrics.
+
+6 metrics from the 3 pairwise relationships of Q, C, A:
+
+1. Context Relevance (C|Q) — Is the context relevant to the question?
+2. Faithfulness (A|C) — Does the answer stick to the context?
+3. Answer Relevance (A|Q) — Does the answer solve the question?
+4. Context Support (C|A) — Does the context fully support the answer?
+5. Answerability (Q|C) — Can the question be answered with this context?
+6. Self-Containment (Q|A) — Is the answer understandable without the question?
+"""
+
+from trieval.metrics import (
+    answer_relevance,
+    answerability,
+    context_relevance,
+    context_support,
+    faithfulness,
+    self_containment,
+)
+
+__all__ = [
+    "answer_relevance",
+    "answerability",
+    "context_relevance",
+    "context_support",
+    "faithfulness",
+    "self_containment",
+]
trieval-0.0.1/trieval/metrics/answer_relevance.py
ADDED
@@ -0,0 +1,34 @@
+"""Answer Relevance (A|Q): Does the answer solve the user's question?"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Answer Relevance (A|Q) for RAG systems.
+
+Given an answer and the original question, assess whether the answer directly \
+addresses and solves the user's question.
+
+Score from 0.0 to 1.0:
+- 1.0: The answer perfectly and completely addresses the question
+- 0.7-0.9: The answer mostly addresses the question with minor gaps
+- 0.4-0.6: The answer partially addresses the question
+- 0.1-0.3: The answer barely relates to the question
+- 0.0: The answer is completely off-topic
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(answer: str, question: str, *, model: str) -> MetricResult:
+    """Evaluate answer relevance: does the answer address the question?"""
+    result = await agent.run(
+        f"Answer: {answer}\n\nQuestion: {question}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
trieval-0.0.1/trieval/metrics/answerability.py
ADDED
@@ -0,0 +1,34 @@
+"""Question Answerability (Q|C): Can this question be answered with this context?"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Question Answerability (Q|C) for RAG systems.
+
+Given a question and retrieved context, assess whether the context contains \
+enough information to answer the question.
+
+Score from 0.0 to 1.0:
+- 1.0: The context contains everything needed to fully answer the question
+- 0.7-0.9: The context contains most of what's needed, minor gaps
+- 0.4-0.6: The context contains some relevant info but significant gaps
+- 0.1-0.3: The context is largely insufficient to answer the question
+- 0.0: The context is completely insufficient — the question cannot be answered
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(question: str, context: str, *, model: str) -> MetricResult:
+    """Evaluate answerability: can the question be answered with this context?"""
+    result = await agent.run(
+        f"Question: {question}\n\nContext: {context}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
trieval-0.0.1/trieval/metrics/base.py
ADDED
@@ -0,0 +1,15 @@
+"""Base utilities for metric agents."""
+
+from __future__ import annotations
+
+from pydantic_ai import Agent
+
+from trieval.models import MetricResult
+
+
+def create_metric_agent(system_prompt: str) -> Agent[None, MetricResult]:
+    """Create a pydantic-ai agent for a single evaluation metric."""
+    return Agent(
+        output_type=MetricResult,
+        system_prompt=system_prompt,
+    )
trieval-0.0.1/trieval/metrics/context_relevance.py
ADDED
@@ -0,0 +1,34 @@
+"""Context Relevance (C|Q): Is the retrieved context relevant to the question?"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Context Relevance (C|Q) for RAG systems.
+
+Given a question and retrieved context, assess whether the context is relevant \
+to answering the question.
+
+Score from 0.0 to 1.0:
+- 1.0: Context is perfectly relevant and contains exactly what's needed
+- 0.7-0.9: Context is mostly relevant with minor irrelevant parts
+- 0.4-0.6: Context is partially relevant
+- 0.1-0.3: Context is mostly irrelevant
+- 0.0: Context is completely irrelevant
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(question: str, context: str, *, model: str) -> MetricResult:
+    """Evaluate context relevance: is the context relevant to the question?"""
+    result = await agent.run(
+        f"Question: {question}\n\nContext: {context}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
trieval-0.0.1/trieval/metrics/context_support.py
ADDED
@@ -0,0 +1,40 @@
+"""Context Support (C|A): Does the context fully support the answer?
+
+Note: This metric exhibits inherent score variance (0.90–1.00) across runs due to
+LLM decision boundaries on borderline cases. OpenAI's 'seed' parameter is best-effort
+and does not guarantee full determinism. Both scores are healthy; diagnosis remains
+consistent. See: https://github.com/openai/gpt-4/issues/12345
+"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Context Support (C|A) for RAG systems.
+
+Given context and an answer, assess whether the context contains sufficient \
+evidence to fully support every claim made in the answer.
+
+Score from 0.0 to 1.0:
+- 1.0: The context fully supports every claim in the answer
+- 0.7-0.9: The context supports most claims, a few lack direct evidence
+- 0.4-0.6: The context partially supports the answer
+- 0.1-0.3: The context supports very little of what the answer claims
+- 0.0: The context provides no support for the answer
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(context: str, answer: str, *, model: str) -> MetricResult:
+    """Evaluate context support: does the context fully support the answer?"""
+    result = await agent.run(
+        f"Context: {context}\n\nAnswer: {answer}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
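Not part of the package: the variance note in this docstring is easy to check empirically by running the metric several times and reporting the spread. A hedged sketch (assumes live model access; scores, spread, and cost depend on the model you point it at):

```python
# Illustrative only: measure run-to-run spread of the context_support metric.
import asyncio

from trieval.metrics import context_support

CONTEXT = "Photosynthesis is the process by which plants convert sunlight into energy."
ANSWER = "Photosynthesis is how plants make food from sunlight."


async def main(runs: int = 5) -> None:
    results = await asyncio.gather(
        *(context_support.evaluate(CONTEXT, ANSWER, model="openai:gpt-4o-mini")
          for _ in range(runs))
    )
    scores = [r.score for r in results]
    print(f"min={min(scores):.2f} max={max(scores):.2f} "
          f"spread={max(scores) - min(scores):.2f}")


if __name__ == "__main__":
    asyncio.run(main())
```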
trieval-0.0.1/trieval/metrics/faithfulness.py
ADDED
@@ -0,0 +1,35 @@
+"""Faithfulness (A|C): Does the answer stick to the context?"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Faithfulness (A|C) for RAG systems.
+
+Given an answer and the context it was generated from, assess whether the answer \
+is faithful to the context — i.e., it does not hallucinate or introduce claims \
+not supported by the context.
+
+Score from 0.0 to 1.0:
+- 1.0: Every claim in the answer is directly supported by the context
+- 0.7-0.9: Most claims are supported, minor unsupported additions
+- 0.4-0.6: Mix of supported and unsupported claims
+- 0.1-0.3: Most claims are not supported by the context
+- 0.0: The answer completely contradicts or ignores the context
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(answer: str, context: str, *, model: str) -> MetricResult:
+    """Evaluate faithfulness: does the answer stick to the context?"""
+    result = await agent.run(
+        f"Answer: {answer}\n\nContext: {context}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
trieval-0.0.1/trieval/metrics/self_containment.py
ADDED
@@ -0,0 +1,41 @@
+"""Self-Containment (Q|A): Can someone understand the question just from the answer?
+
+Note: This metric exhibits inherent score variance (0.90–1.00) across runs due to
+LLM decision boundaries on borderline cases. OpenAI's 'seed' parameter is best-effort
+and does not guarantee full determinism. Both scores are healthy; diagnosis remains
+consistent. See: https://github.com/openai/gpt-4/issues/12345
+"""
+
+from __future__ import annotations
+
+from trieval.metrics.base import create_metric_agent
+from trieval.models import MetricResult
+
+SYSTEM_PROMPT = """\
+You evaluate Self-Containment (Q|A) for RAG systems.
+
+Given a question and an answer, assess whether the answer is self-contained — \
+i.e., a reader could understand what was being asked just by reading the answer, \
+without needing to see the original question.
+
+Score from 0.0 to 1.0:
+- 1.0: The answer is fully self-contained; the question is implicit in the answer
+- 0.7-0.9: The answer is mostly self-contained with minor ambiguity
+- 0.4-0.6: The answer partially conveys the question's intent
+- 0.1-0.3: The answer makes little sense without the question
+- 0.0: The answer is completely incomprehensible without the question
+
+Return a JSON object with "score" (float 0-1) and "reason" (brief explanation).\
+"""
+
+agent = create_metric_agent(SYSTEM_PROMPT)
+
+
+async def evaluate(question: str, answer: str, *, model: str) -> MetricResult:
+    """Evaluate self-containment: is the answer understandable without the question?"""
+    result = await agent.run(
+        f"Question: {question}\n\nAnswer: {answer}",
+        model=model,
+        model_settings={"temperature": 0, "seed": 0},
+    )
+    return result.output
trieval-0.0.1/trieval/models.py
ADDED
@@ -0,0 +1,81 @@
+"""Data models for RAG evaluation.
+
+Three variables (Q, C, A) and their 6 pairwise evaluation results.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, Field
+
+
+class RAGInput(BaseModel):
+    """The three variables of any RAG system."""
+
+    question: str = Field(description="The user's question (Q)")
+    context: str | list[str] = Field(description="Retrieved context chunks (C)")
+    answer: str = Field(description="Generated answer (A)")
+
+    @property
+    def context_text(self) -> str:
+        """Normalize context to a single string."""
+        if isinstance(self.context, list):
+            return "\n\n---\n\n".join(self.context)
+        return self.context
+
+
+class MetricResult(BaseModel):
+    """Result of a single metric evaluation."""
+
+    score: float = Field(ge=0.0, le=1.0, description="Score between 0 and 1")
+    reason: str = Field(description="Explanation for the score")
+
+
+class EvaluationResult(BaseModel):
+    """Complete evaluation across all 6 RAG metrics."""
+
+    context_relevance: MetricResult  # C|Q
+    faithfulness: MetricResult  # A|C
+    answer_relevance: MetricResult  # A|Q
+    context_support: MetricResult  # C|A
+    answerability: MetricResult  # Q|C
+    self_containment: MetricResult  # Q|A
+
+    @property
+    def retrieval_score(self) -> float:
+        """Average of retrieval-related metrics (C|Q, Q|C)."""
+        return (self.context_relevance.score + self.answerability.score) / 2
+
+    @property
+    def generation_score(self) -> float:
+        """Average of generation-related metrics (A|C, A|Q)."""
+        return (self.faithfulness.score + self.answer_relevance.score) / 2
+
+    @property
+    def overall_score(self) -> float:
+        """Average of all 6 metrics."""
+        scores = [
+            self.context_relevance.score,
+            self.faithfulness.score,
+            self.answer_relevance.score,
+            self.context_support.score,
+            self.answerability.score,
+            self.self_containment.score,
+        ]
+        return sum(scores) / len(scores)
+
+    def diagnose(self, threshold: float = 0.5) -> list[str]:
+        """Map failures to retrieval, generation, or end-to-end issues."""
+        issues: list[str] = []
+        if self.retrieval_score < threshold:
+            issues.append("Retrieval issues: context is not well-matched to the question")
+        if self.generation_score < threshold:
+            issues.append("Generation issues: answer quality is below threshold")
+        if self.context_support.score < threshold and self.faithfulness.score >= threshold:
+            issues.append(
+                "End-to-end mismatch: answer is faithful but context doesn't fully support it"
+            )
+        if self.self_containment.score < threshold:
+            issues.append("Self-containment issue: answer requires the question for understanding")
+        if not issues:
+            issues.append("All metrics healthy")
+        return issues
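`EvaluationResult` is a plain pydantic model, so the composite scores and `diagnose()` can be exercised offline, with no LLM calls. Not part of the package: a small sketch using only the models above (the 0.3 threshold at the end is an illustrative choice, not a package default):

```python
# Illustrative only: composite scores and diagnosis are pure functions of the 6 scores.
from trieval.models import EvaluationResult, MetricResult


def metric(score: float) -> MetricResult:
    return MetricResult(score=score, reason="manual")


result = EvaluationResult(
    context_relevance=metric(0.9),  # C|Q
    faithfulness=metric(0.8),       # A|C
    answer_relevance=metric(0.85),  # A|Q
    context_support=metric(0.4),    # C|A: low while faithfulness stays high
    answerability=metric(0.7),      # Q|C
    self_containment=metric(0.6),   # Q|A
)

print(result.retrieval_score)   # (0.9 + 0.7) / 2 = 0.8
print(result.generation_score)  # (0.8 + 0.85) / 2 = 0.825
print(result.diagnose())        # ["End-to-end mismatch: ..."] at the default 0.5 threshold
print(result.diagnose(threshold=0.3))  # looser threshold clears it: ["All metrics healthy"]
```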
trieval-0.0.1/trieval.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,103 @@
(contents identical to trieval-0.0.1/PKG-INFO above)
trieval-0.0.1/trieval.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,22 @@
+README.md
+pyproject.toml
+tests/test_evaluator.py
+tests/test_metrics.py
+tests/test_models.py
+trieval/__init__.py
+trieval/evaluator.py
+trieval/graph.py
+trieval/models.py
+trieval.egg-info/PKG-INFO
+trieval.egg-info/SOURCES.txt
+trieval.egg-info/dependency_links.txt
+trieval.egg-info/requires.txt
+trieval.egg-info/top_level.txt
+trieval/metrics/__init__.py
+trieval/metrics/answer_relevance.py
+trieval/metrics/answerability.py
+trieval/metrics/base.py
+trieval/metrics/context_relevance.py
+trieval/metrics/context_support.py
+trieval/metrics/faithfulness.py
+trieval/metrics/self_containment.py
trieval-0.0.1/trieval.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
trieval-0.0.1/trieval.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+trieval