ragradar-evaluate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragradar_evaluate-0.1.0/.gitignore +38 -0
- ragradar_evaluate-0.1.0/PKG-INFO +162 -0
- ragradar_evaluate-0.1.0/README.md +135 -0
- ragradar_evaluate-0.1.0/pyproject.toml +51 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/__init__.py +43 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/__init__.py +0 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/builder.py +117 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/checker.py +79 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/exporter.py +43 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/seeder.py +89 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/cli.py +390 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/facade.py +535 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/__init__.py +0 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/input_quality.py +327 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/output_quality.py +83 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/__init__.py +0 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/persistence.py +29 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/risk.py +47 -0
- ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/schema.py +40 -0
- ragradar_evaluate-0.1.0/tests/conftest.py +206 -0
- ragradar_evaluate-0.1.0/tests/test_benchmark.py +228 -0
- ragradar_evaluate-0.1.0/tests/test_cli.py +201 -0
- ragradar_evaluate-0.1.0/tests/test_facade.py +315 -0
- ragradar_evaluate-0.1.0/tests/test_input_quality.py +269 -0
- ragradar_evaluate-0.1.0/tests/test_lazy_imports.py +21 -0
- ragradar_evaluate-0.1.0/tests/test_policy.py +39 -0
- ragradar_evaluate-0.1.0/tests/test_risk.py +48 -0
- ragradar_evaluate-0.1.0/tests/test_user_tasks.py +136 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.venv/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.so
|
|
9
|
+
|
|
10
|
+
# uv
|
|
11
|
+
.uv/
|
|
12
|
+
uv.lock
|
|
13
|
+
|
|
14
|
+
# ragradar runtime — never commit user run data
|
|
15
|
+
.ragradar/
|
|
16
|
+
|
|
17
|
+
# environment
|
|
18
|
+
.env
|
|
19
|
+
*.env
|
|
20
|
+
.env.*
|
|
21
|
+
|
|
22
|
+
# IDE
|
|
23
|
+
.vscode/
|
|
24
|
+
.idea/
|
|
25
|
+
*.swp
|
|
26
|
+
|
|
27
|
+
# OS
|
|
28
|
+
.DS_Store
|
|
29
|
+
Thumbs.db
|
|
30
|
+
|
|
31
|
+
# test output
|
|
32
|
+
.pytest_cache/
|
|
33
|
+
htmlcov/
|
|
34
|
+
.coverage
|
|
35
|
+
|
|
36
|
+
# example output
|
|
37
|
+
examples/rag_pipeline/output/
|
|
38
|
+
.claude/
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragradar-evaluate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evaluation layer for ragradar observability system
|
|
5
|
+
Project-URL: Homepage, https://github.com/pleokarthik/RAGRadar
|
|
6
|
+
Project-URL: Repository, https://github.com/pleokarthik/RAGRadar
|
|
7
|
+
Project-URL: Issues, https://github.com/pleokarthik/RAGRadar/issues
|
|
8
|
+
Author-email: Leo Karthik Paramasivan <pleokarthik@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: click>=8.0
|
|
18
|
+
Requires-Dist: ragas>=0.2
|
|
19
|
+
Requires-Dist: ragradar-core<0.2.0,>=0.1.0
|
|
20
|
+
Requires-Dist: rich>=13.0
|
|
21
|
+
Requires-Dist: scipy>=1.13
|
|
22
|
+
Provides-Extra: ollama
|
|
23
|
+
Requires-Dist: ollama>=0.2; extra == 'ollama'
|
|
24
|
+
Provides-Extra: transformers
|
|
25
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'transformers'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# ragradar-evaluate
|
|
29
|
+
|
|
30
|
+
Scores captured runs. Two tasks, one discovery helper:
|
|
31
|
+
|
|
32
|
+
| Task | Call | Cost |
|
|
33
|
+
|---|---|---|
|
|
34
|
+
| "Is this run healthy?" | `check(run_id)` | free — deterministic, no LLM, instant |
|
|
35
|
+
| "Score it fully" | `evaluate(run_id)` | free input metrics + LLM-judged output metrics |
|
|
36
|
+
| "What can be scored?" | `available_metrics()` | free |
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
pip install ragradar-evaluate
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Is this run healthy? — check()
|
|
43
|
+
|
|
44
|
+
Call before paying for an LLM; put it in CI.
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import ragradar_capture
|
|
48
|
+
from ragradar_evaluate import check
|
|
49
|
+
|
|
50
|
+
run_id = ragradar_capture.capture(
|
|
51
|
+
"what is RRF?", "RRF fuses rankings.",
|
|
52
|
+
chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
|
|
53
|
+
"content": "RRF combines rankings.", "token_count": 10,
|
|
54
|
+
"rerank_score": 0.9}],
|
|
55
|
+
)
|
|
56
|
+
result = check(run_id)
|
|
57
|
+
|
|
58
|
+
print(result.verdict) # "ok" | "warn" | "fail"
|
|
59
|
+
print(result.problems) # ["duplicate chunks: ratio 0.50 exceeds 0.20", ...]
|
|
60
|
+
print(result.risk_score) # 0.0-1.0, None if input quality couldn't be assessed
|
|
61
|
+
print(result.factors) # per-factor {value, threshold, status}
|
|
62
|
+
print(result.thresholds) # "learned" | "policy" — which standards were applied
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
`check()` compares all free input metrics against the **current
|
|
66
|
+
standards**: once at least 10 evaluated runs exist for the pipeline it
|
|
67
|
+
uses thresholds learned from your own history (and says so via
|
|
68
|
+
`thresholds == "learned"`); before that it falls back to the policy
|
|
69
|
+
defaults. A run captured without chunks gets a `warn` verdict explaining
|
|
70
|
+
the missing data — never an exception.
|
|
71
|
+
|
|
72
|
+
## Score it fully — evaluate()
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
import ragradar_capture
|
|
76
|
+
from ragradar_evaluate import evaluate
|
|
77
|
+
|
|
78
|
+
run_id = ragradar_capture.capture(
|
|
79
|
+
"what is RRF?", "RRF fuses rankings.",
|
|
80
|
+
chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
|
|
81
|
+
"content": "RRF combines rankings.", "token_count": 10,
|
|
82
|
+
"rerank_score": 0.9}],
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Complete eval: every metric applicable to the record.
|
|
86
|
+
result = evaluate(run_id)
|
|
87
|
+
|
|
88
|
+
# One atomic metric — nothing else is computed:
|
|
89
|
+
result = evaluate(run_id, metrics=["duplicates"], save=False)
|
|
90
|
+
|
|
91
|
+
# A chosen subset:
|
|
92
|
+
result = evaluate(run_id, metrics=["relevance", "faithfulness"])
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
`target` can be an sNrN string (what `ragradar_capture.capture()` returns), a
|
|
96
|
+
committed `Capture` object, or a bare `RunRecord` (then pass
|
|
97
|
+
`save=False` — there's no run row to write to).
|
|
98
|
+
|
|
99
|
+
### EvalResult
|
|
100
|
+
|
|
101
|
+
| Field | Meaning |
|
|
102
|
+
|---|---|
|
|
103
|
+
| `metrics` | per-metric results: a dict of values for input families, a float for RAGAS metrics |
|
|
104
|
+
| `skipped` | metric → reason: `"not requested"`, `"missing data: ..."`, or `"requires ground_truth"` |
|
|
105
|
+
| `errors` | metric → error string; RAGAS-not-installed and RAGAS runtime failures land here identically — `evaluate()` never raises for judge failures |
|
|
106
|
+
| `policy_violations` | policy thresholds breached by the computed values |
|
|
107
|
+
| `risk_score` | `None` when input metrics weren't computed; `0.0` only ever means "computed, no risk" |
|
|
108
|
+
| `run_id` / `saved` | identity and whether scores were persisted |
|
|
109
|
+
|
|
110
|
+
`save=True` (default) persists via the one store path; `ragradar explain
|
|
111
|
+
<run_id>` then shows the scores alongside its analysis.
|
|
112
|
+
|
|
113
|
+
## available_metrics()
|
|
114
|
+
|
|
115
|
+
| Metric | Layer | Cost | Requires |
|
|
116
|
+
|---|---|---|---|
|
|
117
|
+
| `relevance` | input | free | chunks |
|
|
118
|
+
| `duplicates` | input | free | chunks |
|
|
119
|
+
| `truncation` | input | free | chunks |
|
|
120
|
+
| `token_efficiency` | input | free | chunks |
|
|
121
|
+
| `coherence` | input | free | chunks |
|
|
122
|
+
| `faithfulness` | output | llm | chunks, response |
|
|
123
|
+
| `answer_relevancy` | output | llm | chunks, response |
|
|
124
|
+
| `context_precision` | output | llm | chunks, response |
|
|
125
|
+
| `context_recall` | output | llm | chunks, response, ground_truth |
|
|
126
|
+
|
|
127
|
+
Output metrics are RAGAS LLM-as-judge calls — they cost money and need a
|
|
128
|
+
configured judge. **To stay free-only**, use `check()`, or select input
|
|
129
|
+
metrics explicitly: `evaluate(run_id, metrics=["relevance",
|
|
130
|
+
"duplicates", "truncation", "token_efficiency", "coherence"])`.
|
|
131
|
+
|
|
132
|
+
## Policy system
|
|
133
|
+
|
|
134
|
+
Human-set thresholds encoding known failure modes; active from day one
|
|
135
|
+
and the fallback standard for `check()`. Stored per pipeline.
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
ragradar-evaluate policy show
|
|
139
|
+
ragradar-evaluate policy set max_duplicate_ratio 0.1
|
|
140
|
+
ragradar-evaluate policy reset
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Programmatic override: `evaluate(run_id, policy=InputQualityPolicy(...))`.
|
|
144
|
+
|
|
145
|
+
## Benchmark lifecycle (CLI)
|
|
146
|
+
|
|
147
|
+
Learned thresholds accumulate as you evaluate real runs — `check()`
|
|
148
|
+
picks them up automatically at 10+ evaluated runs. The CLI exposes the
|
|
149
|
+
machinery for inspection:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
ragradar-evaluate run s2r3 # evaluate one run (both layers)
|
|
153
|
+
ragradar-evaluate run s2r3 --input-only # free metrics only
|
|
154
|
+
ragradar-evaluate run --session s2 # evaluate a whole session
|
|
155
|
+
ragradar-evaluate benchmark show # current learned thresholds
|
|
156
|
+
ragradar-evaluate benchmark build # rebuild from evaluated history
|
|
157
|
+
ragradar-evaluate benchmark check s2r3 # factor-by-factor threshold check
|
|
158
|
+
ragradar-evaluate benchmark export # RAGAS-compatible JSONL dataset
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
`--input-only --output-only` together is an error (it would compute
|
|
162
|
+
nothing).
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# ragradar-evaluate
|
|
2
|
+
|
|
3
|
+
Scores captured runs. Two tasks, one discovery helper:
|
|
4
|
+
|
|
5
|
+
| Task | Call | Cost |
|
|
6
|
+
|---|---|---|
|
|
7
|
+
| "Is this run healthy?" | `check(run_id)` | free — deterministic, no LLM, instant |
|
|
8
|
+
| "Score it fully" | `evaluate(run_id)` | free input metrics + LLM-judged output metrics |
|
|
9
|
+
| "What can be scored?" | `available_metrics()` | free |
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
pip install ragradar-evaluate
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Is this run healthy? — check()
|
|
16
|
+
|
|
17
|
+
Call before paying for an LLM; put it in CI.
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import ragradar_capture
|
|
21
|
+
from ragradar_evaluate import check
|
|
22
|
+
|
|
23
|
+
run_id = ragradar_capture.capture(
|
|
24
|
+
"what is RRF?", "RRF fuses rankings.",
|
|
25
|
+
chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
|
|
26
|
+
"content": "RRF combines rankings.", "token_count": 10,
|
|
27
|
+
"rerank_score": 0.9}],
|
|
28
|
+
)
|
|
29
|
+
result = check(run_id)
|
|
30
|
+
|
|
31
|
+
print(result.verdict) # "ok" | "warn" | "fail"
|
|
32
|
+
print(result.problems) # ["duplicate chunks: ratio 0.50 exceeds 0.20", ...]
|
|
33
|
+
print(result.risk_score) # 0.0-1.0, None if input quality couldn't be assessed
|
|
34
|
+
print(result.factors) # per-factor {value, threshold, status}
|
|
35
|
+
print(result.thresholds) # "learned" | "policy" — which standards were applied
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
`check()` compares all free input metrics against the **current
|
|
39
|
+
standards**: once at least 10 evaluated runs exist for the pipeline it
|
|
40
|
+
uses thresholds learned from your own history (and says so via
|
|
41
|
+
`thresholds == "learned"`); before that it falls back to the policy
|
|
42
|
+
defaults. A run captured without chunks gets a `warn` verdict explaining
|
|
43
|
+
the missing data — never an exception.
|
|
44
|
+
|
|
45
|
+
## Score it fully — evaluate()
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import ragradar_capture
|
|
49
|
+
from ragradar_evaluate import evaluate
|
|
50
|
+
|
|
51
|
+
run_id = ragradar_capture.capture(
|
|
52
|
+
"what is RRF?", "RRF fuses rankings.",
|
|
53
|
+
chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
|
|
54
|
+
"content": "RRF combines rankings.", "token_count": 10,
|
|
55
|
+
"rerank_score": 0.9}],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Complete eval: every metric applicable to the record.
|
|
59
|
+
result = evaluate(run_id)
|
|
60
|
+
|
|
61
|
+
# One atomic metric — nothing else is computed:
|
|
62
|
+
result = evaluate(run_id, metrics=["duplicates"], save=False)
|
|
63
|
+
|
|
64
|
+
# A chosen subset:
|
|
65
|
+
result = evaluate(run_id, metrics=["relevance", "faithfulness"])
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`target` can be an sNrN string (what `ragradar_capture.capture()` returns), a
|
|
69
|
+
committed `Capture` object, or a bare `RunRecord` (then pass
|
|
70
|
+
`save=False` — there's no run row to write to).
|
|
71
|
+
|
|
72
|
+
### EvalResult
|
|
73
|
+
|
|
74
|
+
| Field | Meaning |
|
|
75
|
+
|---|---|
|
|
76
|
+
| `metrics` | per-metric results: a dict of values for input families, a float for RAGAS metrics |
|
|
77
|
+
| `skipped` | metric → reason: `"not requested"`, `"missing data: ..."`, or `"requires ground_truth"` |
|
|
78
|
+
| `errors` | metric → error string; RAGAS-not-installed and RAGAS runtime failures land here identically — `evaluate()` never raises for judge failures |
|
|
79
|
+
| `policy_violations` | policy thresholds breached by the computed values |
|
|
80
|
+
| `risk_score` | `None` when input metrics weren't computed; `0.0` only ever means "computed, no risk" |
|
|
81
|
+
| `run_id` / `saved` | identity and whether scores were persisted |
|
|
82
|
+
|
|
83
|
+
`save=True` (default) persists via the one store path; `ragradar explain
|
|
84
|
+
<run_id>` then shows the scores alongside its analysis.
|
|
85
|
+
|
|
86
|
+
## available_metrics()
|
|
87
|
+
|
|
88
|
+
| Metric | Layer | Cost | Requires |
|
|
89
|
+
|---|---|---|---|
|
|
90
|
+
| `relevance` | input | free | chunks |
|
|
91
|
+
| `duplicates` | input | free | chunks |
|
|
92
|
+
| `truncation` | input | free | chunks |
|
|
93
|
+
| `token_efficiency` | input | free | chunks |
|
|
94
|
+
| `coherence` | input | free | chunks |
|
|
95
|
+
| `faithfulness` | output | llm | chunks, response |
|
|
96
|
+
| `answer_relevancy` | output | llm | chunks, response |
|
|
97
|
+
| `context_precision` | output | llm | chunks, response |
|
|
98
|
+
| `context_recall` | output | llm | chunks, response, ground_truth |
|
|
99
|
+
|
|
100
|
+
Output metrics are RAGAS LLM-as-judge calls — they cost money and need a
|
|
101
|
+
configured judge. **To stay free-only**, use `check()`, or select input
|
|
102
|
+
metrics explicitly: `evaluate(run_id, metrics=["relevance",
|
|
103
|
+
"duplicates", "truncation", "token_efficiency", "coherence"])`.
|
|
104
|
+
|
|
105
|
+
## Policy system
|
|
106
|
+
|
|
107
|
+
Human-set thresholds encoding known failure modes; active from day one
|
|
108
|
+
and the fallback standard for `check()`. Stored per pipeline.
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
ragradar-evaluate policy show
|
|
112
|
+
ragradar-evaluate policy set max_duplicate_ratio 0.1
|
|
113
|
+
ragradar-evaluate policy reset
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Programmatic override: `evaluate(run_id, policy=InputQualityPolicy(...))`.
|
|
117
|
+
|
|
118
|
+
## Benchmark lifecycle (CLI)
|
|
119
|
+
|
|
120
|
+
Learned thresholds accumulate as you evaluate real runs — `check()`
|
|
121
|
+
picks them up automatically at 10+ evaluated runs. The CLI exposes the
|
|
122
|
+
machinery for inspection:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
ragradar-evaluate run s2r3 # evaluate one run (both layers)
|
|
126
|
+
ragradar-evaluate run s2r3 --input-only # free metrics only
|
|
127
|
+
ragradar-evaluate run --session s2 # evaluate a whole session
|
|
128
|
+
ragradar-evaluate benchmark show # current learned thresholds
|
|
129
|
+
ragradar-evaluate benchmark build # rebuild from evaluated history
|
|
130
|
+
ragradar-evaluate benchmark check s2r3 # factor-by-factor threshold check
|
|
131
|
+
ragradar-evaluate benchmark export # RAGAS-compatible JSONL dataset
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
`--input-only --output-only` together is an error (it would compute
|
|
135
|
+
nothing).
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ragradar-evaluate"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Evaluation layer for ragradar observability system"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Leo Karthik Paramasivan", email = "pleokarthik@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"ragradar-core>=0.1.0,<0.2.0",
|
|
21
|
+
"ragas>=0.2",
|
|
22
|
+
"scipy>=1.13",
|
|
23
|
+
"rich>=13.0",
|
|
24
|
+
"click>=8.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/pleokarthik/RAGRadar"
|
|
29
|
+
Repository = "https://github.com/pleokarthik/RAGRadar"
|
|
30
|
+
Issues = "https://github.com/pleokarthik/RAGRadar/issues"
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
ollama = [
|
|
34
|
+
"ollama>=0.2",
|
|
35
|
+
]
|
|
36
|
+
transformers = [
|
|
37
|
+
"sentence-transformers>=3.0",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
ragradar-evaluate = "ragradar_evaluate.cli:main"
|
|
42
|
+
|
|
43
|
+
[tool.uv.sources]
|
|
44
|
+
ragradar-core = { workspace = true }
|
|
45
|
+
|
|
46
|
+
[build-system]
|
|
47
|
+
requires = ["hatchling"]
|
|
48
|
+
build-backend = "hatchling.build"
|
|
49
|
+
|
|
50
|
+
[tool.hatch.build.targets.wheel]
|
|
51
|
+
packages = ["src/ragradar_evaluate"]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from ragradar_core.schema import (
|
|
2
|
+
CacheEvent,
|
|
3
|
+
ChunkRecord,
|
|
4
|
+
RunRecord,
|
|
5
|
+
TokenBudget,
|
|
6
|
+
TokenUsage,
|
|
7
|
+
ToolCallRecord,
|
|
8
|
+
Turn,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from ragradar_evaluate.facade import (
|
|
12
|
+
CheckResult,
|
|
13
|
+
EvalResult,
|
|
14
|
+
MetricInfo,
|
|
15
|
+
available_metrics,
|
|
16
|
+
check,
|
|
17
|
+
evaluate,
|
|
18
|
+
)
|
|
19
|
+
from ragradar_evaluate.policy.schema import InputQualityPolicy
|
|
20
|
+
|
|
21
|
+
# NOTE: benchmark machinery (seeding, building, checking, exporting) is
|
|
22
|
+
# internal — the CLI's `benchmark` commands drive it, and check() consults
|
|
23
|
+
# learned thresholds automatically. It is deliberately absent here.
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
# User tasks
|
|
27
|
+
"check",
|
|
28
|
+
"evaluate",
|
|
29
|
+
"available_metrics",
|
|
30
|
+
# Result / config types
|
|
31
|
+
"CheckResult",
|
|
32
|
+
"EvalResult",
|
|
33
|
+
"MetricInfo",
|
|
34
|
+
"InputQualityPolicy",
|
|
35
|
+
# Re-exported schema dataclasses so users need only one import.
|
|
36
|
+
"ChunkRecord",
|
|
37
|
+
"TokenBudget",
|
|
38
|
+
"TokenUsage",
|
|
39
|
+
"Turn",
|
|
40
|
+
"CacheEvent",
|
|
41
|
+
"ToolCallRecord",
|
|
42
|
+
"RunRecord",
|
|
43
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from ragradar_core import store
|
|
4
|
+
from scipy import stats
|
|
5
|
+
|
|
6
|
+
INPUT_FACTORS = [
|
|
7
|
+
"duplicate_ratio",
|
|
8
|
+
"top_chunk_score",
|
|
9
|
+
"high_score_truncations",
|
|
10
|
+
"token_headroom_pct",
|
|
11
|
+
"source_domain_count",
|
|
12
|
+
"low_score_chunk_ratio",
|
|
13
|
+
"mean_relevance",
|
|
14
|
+
"truncated_count",
|
|
15
|
+
"score_variance",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
RAGAS_METRICS = ["faithfulness", "answer_relevancy"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _suggest_threshold(values: list[float], ragas_scores: list[float]) -> float:
|
|
22
|
+
sorted_vals = sorted(set(values))
|
|
23
|
+
if len(sorted_vals) < 2:
|
|
24
|
+
return sorted_vals[0] if sorted_vals else 0.0
|
|
25
|
+
|
|
26
|
+
best_threshold = sorted_vals[0]
|
|
27
|
+
best_diff = 0.0
|
|
28
|
+
|
|
29
|
+
for i in range(len(sorted_vals) - 1):
|
|
30
|
+
threshold = (sorted_vals[i] + sorted_vals[i + 1]) / 2
|
|
31
|
+
below = [r for v, r in zip(values, ragas_scores) if v <= threshold]
|
|
32
|
+
above = [r for v, r in zip(values, ragas_scores) if v > threshold]
|
|
33
|
+
|
|
34
|
+
if not below or not above:
|
|
35
|
+
continue
|
|
36
|
+
|
|
37
|
+
diff = abs(sum(above) / len(above) - sum(below) / len(below))
|
|
38
|
+
if diff > best_diff:
|
|
39
|
+
best_diff = diff
|
|
40
|
+
best_threshold = threshold
|
|
41
|
+
|
|
42
|
+
return round(best_threshold, 4)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build(pipeline: str | None = None) -> dict:
    """Rebuild the per-factor benchmark from evaluated runs and persist it.

    Loads every evaluated run for *pipeline* from the store, and for each
    name in ``INPUT_FACTORS`` collects the runs that have both a value for
    that factor and at least one RAGAS metric. Factors with fewer than three
    such runs are left out. For the rest it computes a Pearson correlation
    against each RAGAS metric and a suggested threshold (split against the
    first metric in ``RAGAS_METRICS``).

    One entry per kept factor is handed to
    ``store.write_benchmark_entries_batch`` as
    ``(pipeline key, factor, threshold, strongest correlation, sample count)``,
    where the pipeline key is *pipeline* or ``"__default"``.

    Returns a dict with ``run_count``, ``pipeline`` and ``factors`` (factor
    name -> correlations, ``suggested_threshold``, ``sample_count``).

    Raises:
        ValueError: fewer than 10 evaluated runs were found.
    """
    runs = store.get_all_evaluated_runs(pipeline)
    if len(runs) < 10:
        raise ValueError(f"Need at least 10 evaluated runs to build benchmark, found {len(runs)}.")

    score_blobs = [json.loads(row["eval_scores"]) for row in runs]
    storage_key = pipeline or "__default"

    report: dict = {}
    pending_rows: list[tuple] = []

    for factor in INPUT_FACTORS:
        xs: list[float] = []
        ys_by_metric: dict[str, list[float]] = {metric: [] for metric in RAGAS_METRICS}

        for blob in score_blobs:
            inputs = blob.get("input") or {}
            outputs = blob.get("output") or {}

            raw = inputs.get(factor)
            if raw is None:
                continue
            # A run with no RAGAS metric at all contributes nothing.
            if all(outputs.get(metric) is None for metric in RAGAS_METRICS):
                continue

            xs.append(float(raw))
            # Every metric list grows in lockstep with xs: a metric that is
            # missing (or falsy) for this run is recorded as 0.0.
            for metric in RAGAS_METRICS:
                ys_by_metric[metric].append(float(outputs.get(metric) or 0.0))

        n_samples = len(xs)
        if n_samples < 3:
            continue

        x_varies = len(set(xs)) > 1
        correlations: dict[str, float | None] = {}
        for metric in RAGAS_METRICS:
            ys = ys_by_metric[metric]
            key = f"{metric}_correlation"
            # Correlation is only attempted when both series actually vary.
            if len(ys) == n_samples and x_varies and len(set(ys)) > 1:
                r_value, _ = stats.pearsonr(xs, ys)
                # float(): scipy stubs expose the statistic as numpy.float64,
                # not float; the cast narrows it without a type: ignore.
                correlations[key] = round(float(r_value), 4)
            else:
                correlations[key] = None

        usable = [c for c in correlations.values() if c is not None]
        strongest = max(usable, key=abs) if usable else 0.0

        threshold = _suggest_threshold(xs, ys_by_metric[RAGAS_METRICS[0]])

        pending_rows.append((storage_key, factor, threshold, strongest, n_samples))
        report[factor] = {
            **correlations,
            "suggested_threshold": threshold,
            "sample_count": n_samples,
        }

    store.write_benchmark_entries_batch(pending_rows)

    return {
        "run_count": len(runs),
        "pipeline": pipeline,
        "factors": report,
    }
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from ragradar_core import store
|
|
4
|
+
from ragradar_core.schema import RunRecord
|
|
5
|
+
|
|
6
|
+
from ragradar_evaluate.layers import input_quality
|
|
7
|
+
from ragradar_evaluate.policy.persistence import load_policy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check(
    session_id: int,
    run_seq: int,
    pipeline: str | None = None,
) -> dict:
    """Compare one stored run's input-quality factors to the benchmark.

    The run is loaded from the store and scored with
    ``input_quality.score_input_quality`` under the policy loaded for the
    effective pipeline (*pipeline*, else the run's own pipeline, else
    ``"__default"``). Six factors are then compared against the thresholds
    returned by ``store.get_benchmark``; a factor with no value or no
    threshold is reported as ``"ok"``.

    Returns a dict with ``run_id``, ``risk_score``, ``benchmark_available``,
    ``factors`` (factor -> ``value`` / ``benchmark_threshold`` / ``status``)
    and ``overall`` (``"ok"``, ``"warn"`` or ``"fail"``).

    Raises:
        ValueError: the run does not exist in the store.
    """
    run_row = store.get_run(session_id, run_seq)
    if run_row is None:
        raise ValueError(f"Run s{session_id}r{run_seq} not found.")

    record = RunRecord.from_json(json.loads(run_row["run_data"]))
    pipeline = pipeline or run_row["pipeline"] or "__default"
    policy = load_policy(pipeline)

    input_scores = input_quality.score_input_quality(record, policy)
    benchmark = store.get_benchmark(pipeline)
    rows_by_factor = {entry["factor"]: entry for entry in benchmark}

    # Which side of the threshold counts as a failure, per factor.
    directions = {
        "duplicate_ratio": "higher_bad",
        "top_chunk_score": "lower_bad",
        "high_score_truncations": "higher_bad",
        "token_headroom_pct": "lower_bad",
        "source_domain_count": "higher_bad",
        "low_score_chunk_ratio": "higher_bad",
    }

    factors = {}
    fail_count = 0

    for factor, direction in directions.items():
        value = input_scores.get(factor) if input_scores else None
        entry = rows_by_factor.get(factor)
        threshold = entry["threshold"] if entry else None

        breached = False
        if value is not None and threshold is not None:
            if direction == "lower_bad":
                breached = value < threshold
            else:
                breached = value > threshold

        if breached:
            fail_count += 1

        factors[factor] = {
            "value": value,
            "benchmark_threshold": threshold,
            "status": "fail" if breached else "ok",
        }

    # risk_score is None when the run was never evaluated or its input
    # metrics could not be computed (0.0 strictly means "computed, no
    # risk") — unknown risk never counts toward the verdict.
    stored_scores = store.get_eval_scores(session_id, run_seq)
    risk = stored_scores.get("risk_score") if stored_scores else None

    high_risk = risk is not None and risk > 0.7
    if high_risk or fail_count >= 3:
        overall = "fail"
    elif fail_count >= 1:
        overall = "warn"
    else:
        overall = "ok"

    return {
        "run_id": f"s{session_id}r{run_seq}",
        "risk_score": risk,
        "benchmark_available": len(benchmark) > 0,
        "factors": factors,
        "overall": overall,
    }
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ragradar_core import store
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def export(pipeline: str | None = None, output_path: Path | None = None) -> Path:
    """Write evaluated runs to a JSONL file, one JSON object per run.

    Runs whose pipeline name ends with ``"__seeded"`` are skipped, as are
    runs whose stored data has no chunks or no response. Each written object
    carries ``question``, ``answer``, ``contexts``, ``ground_truth`` (always
    ``None``), ``run_id``, ``pipeline`` and ``evaluated_at``.

    When *output_path* is omitted the file is created under the store's
    ``exports`` directory as ``<pipeline or "all">_ragas_<timestamp>.jsonl``.
    Parent directories are created as needed in both cases.

    Returns the path that was written.
    """
    exportable = []
    for row in store.get_all_evaluated_runs(pipeline):
        row_pipeline = row["pipeline"]
        if row_pipeline and row_pipeline.endswith("__seeded"):
            continue
        payload = json.loads(row["run_data"])
        if payload.get("chunks") and payload.get("response"):
            exportable.append((row, payload))

    if output_path is not None:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        label = pipeline or "all"
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target_dir = store._ragradar_dir() / "exports"
        target_dir.mkdir(parents=True, exist_ok=True)
        output_path = target_dir / f"{label}_ragas_{stamp}.jsonl"

    with open(output_path, "w", encoding="utf-8") as sink:
        for row, payload in exportable:
            line = json.dumps(
                {
                    "question": payload["query"],
                    "answer": payload["response"],
                    "contexts": [chunk["content"] for chunk in payload.get("chunks", [])],
                    "ground_truth": None,
                    "run_id": f"s{row['session_id']}r{row['run_seq']}",
                    "pipeline": row["pipeline"],
                    "evaluated_at": row["evaluated_at"],
                }
            )
            sink.write(line + "\n")

    return output_path
|