rag-eval-lite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_eval_lite-0.1.0/PKG-INFO +26 -0
- rag_eval_lite-0.1.0/README.md +13 -0
- rag_eval_lite-0.1.0/pyproject.toml +28 -0
- rag_eval_lite-0.1.0/rag_eval/__init__.py +3 -0
- rag_eval_lite-0.1.0/rag_eval/metrics.py +113 -0
- rag_eval_lite-0.1.0/rag_eval_lite.egg-info/PKG-INFO +26 -0
- rag_eval_lite-0.1.0/rag_eval_lite.egg-info/SOURCES.txt +8 -0
- rag_eval_lite-0.1.0/rag_eval_lite.egg-info/dependency_links.txt +1 -0
- rag_eval_lite-0.1.0/rag_eval_lite.egg-info/top_level.txt +2 -0
- rag_eval_lite-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rag-eval-lite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight evaluation metrics for RAG (Hit@k, Recall@k, MRR, nDCG)
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/rag-eval-lite
|
|
7
|
+
Keywords: rag,evaluation,retrieval,ml,nlp
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# rag-eval-lite
|
|
15
|
+
|
|
16
|
+
Lightweight RAG evaluation metrics.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
pip install rag-eval-lite
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
```python
|
|
23
|
+
from rag_eval import evaluate_dataset
|
|
24
|
+
|
|
25
|
+
results = evaluate_dataset(data, k=3)
|
|
26
|
+
print(results)
```
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rag-eval-lite"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Lightweight evaluation metrics for RAG (Hit@k, Recall@k, MRR, nDCG)"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Your Name" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
dependencies = []
|
|
15
|
+
|
|
16
|
+
keywords = ["rag", "evaluation", "retrieval", "ml", "nlp"]
|
|
17
|
+
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/yourusername/rag-eval-lite"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["."]
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# ---------- Metrics ----------
|
|
6
|
+
|
|
7
|
+
def hit_at_k(golden, retrieved, k):
    """Return 1.0 if any of the first ``k`` retrieved ids is a golden id.

    Args:
        golden: Collection of relevant chunk ids (anything supporting ``in``).
        retrieved: Ranked sequence of retrieved chunk ids, best first.
        k: Cutoff rank. Values ``<= 0`` inspect nothing and score 0.0.

    Returns:
        1.0 when at least one of ``retrieved[:k]`` is in ``golden``, else 0.0.
    """
    # Without this guard a negative ``k`` would slice from the END of the
    # list (``retrieved[:-1]``) and could report a hit for an invalid cutoff.
    if k <= 0:
        return 0.0
    return 1.0 if any(cid in golden for cid in retrieved[:k]) else 0.0
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def precision_at_k(golden, retrieved, k):
    """Return the fraction of the top-``k`` slots filled with golden ids.

    The denominator is always ``k`` (not the number of ids actually
    retrieved), so returning fewer than ``k`` results lowers the score.

    Args:
        golden: Collection of relevant chunk ids (anything supporting ``in``).
        retrieved: Ranked sequence of retrieved chunk ids, best first.
        k: Cutoff rank. Values ``<= 0`` score 0.0.

    Returns:
        ``(# of retrieved[:k] that are in golden) / k`` as a float.
    """
    # ``k <= 0`` rather than ``k == 0``: a negative cutoff would otherwise
    # slice from the end of the list and divide by a negative number,
    # producing a negative "precision".
    if k <= 0:
        return 0.0
    top_k = retrieved[:k]
    relevant = sum(1 for cid in top_k if cid in golden)
    return relevant / k
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def recall_at_k(golden, retrieved, k):
    """Return the fraction of distinct golden ids found in the top ``k``.

    Each golden id is counted at most once, so the result always lies in
    ``[0.0, 1.0]`` even when ``retrieved`` (or ``golden``) contains
    duplicate ids.

    Args:
        golden: Collection of relevant chunk ids.
        retrieved: Ranked sequence of retrieved chunk ids, best first.
        k: Cutoff rank. Values ``<= 0`` score 0.0.

    Returns:
        ``(# distinct golden ids in retrieved[:k]) / (# distinct golden ids)``,
        or 0.0 when ``golden`` is empty.
    """
    # A negative cutoff would slice from the end of the list; treat every
    # non-positive ``k`` as "nothing inspected".
    if not golden or k <= 0:
        return 0.0

    top_k = retrieved[:k]

    # De-duplicate with ``in`` checks instead of a set so ids are only
    # required to support equality, exactly as in the membership tests above.
    distinct_golden = []
    for cid in golden:
        if cid not in distinct_golden:
            distinct_golden.append(cid)

    # Counting golden ids present in the top-k (rather than top-k entries
    # present in golden) keeps a repeated retrieved id from scoring twice.
    found = sum(1 for cid in distinct_golden if cid in top_k)
    return found / len(distinct_golden)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def mrr(golden, retrieved):
    """Return the reciprocal rank of the first retrieved id that is golden.

    Ranks are 1-based, so a hit in the first position scores 1.0, in the
    second 0.5, and so on. The whole ``retrieved`` list is scanned (there is
    no cutoff). Returns 0.0 when no retrieved id is in ``golden``.
    """
    reciprocal_ranks = (
        1.0 / rank
        for rank, cid in enumerate(retrieved, start=1)
        if cid in golden
    )
    # The generator is lazy, so only the first match is ever computed.
    return next(reciprocal_ranks, 0.0)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def dcg_at_k(golden, retrieved, k):
    """Return the binary-relevance discounted cumulative gain at rank ``k``.

    A golden id contributes ``1 / log2(rank + 1)`` at the (1-based) rank of
    its first appearance in ``retrieved[:k]``. Later repeats of the same id
    add nothing, which keeps the DCG from exceeding the ideal DCG of a
    duplicate-free ranking.

    Args:
        golden: Collection of relevant chunk ids (anything supporting ``in``).
        retrieved: Ranked sequence of retrieved chunk ids, best first.
        k: Cutoff rank. Values ``<= 0`` score 0.0.

    Returns:
        The DCG as a float; 0.0 when nothing relevant is in the top ``k``.
    """
    # A negative cutoff would slice from the end of the list; treat every
    # non-positive ``k`` as "nothing inspected".
    if k <= 0:
        return 0.0

    score = 0.0
    credited = []  # golden ids already rewarded; list keeps ids hash-free
    for i, cid in enumerate(retrieved[:k]):
        if cid in golden and cid not in credited:
            credited.append(cid)
            # ``i`` is 0-based, so rank + 1 == i + 2.
            score += 1.0 / math.log2(i + 2)
    return score
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def ndcg_at_k(golden, retrieved, k):
    """Return the normalized DCG at rank ``k`` with binary relevance.

    The DCG from ``dcg_at_k`` is divided by the ideal DCG, i.e. the DCG of a
    ranking that places every distinct golden id (up to ``k`` of them) at the
    top of the list.

    Args:
        golden: Collection of relevant chunk ids.
        retrieved: Ranked sequence of retrieved chunk ids, best first.
        k: Cutoff rank.

    Returns:
        ``dcg / ideal_dcg`` as a float, or 0.0 when the ideal DCG is zero
        (empty ``golden`` or non-positive ``k``).
    """
    dcg = dcg_at_k(golden, retrieved, k)

    # Count each golden id once: a duplicated id in the ground truth cannot
    # be retrieved "twice", so counting it twice would inflate the ideal DCG
    # and make a perfect ranking score below 1.0.
    distinct_golden = []
    for cid in golden:
        if cid not in distinct_golden:
            distinct_golden.append(cid)

    ideal_hits = min(len(distinct_golden), k)
    # ``range`` of a non-positive count is empty, so idcg stays 0.0 there.
    idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------- Main evaluator ----------
|
|
50
|
+
|
|
51
|
+
def evaluate_dataset(
    data: List[Dict[str, Any]],
    k: int = 3,
    return_failures: bool = True
) -> Dict[str, Any]:
    """Compute averaged retrieval metrics over a list of evaluation rows.

    Each row is a dict read with ``.get``. The keys used are
    ``"golden_chunk_ids"`` and ``"retrieved_chunk_ids"`` (missing or ``None``
    is treated as an empty list), ``"question"`` (default ``""``) and
    ``"question_id"`` (default: the row's index in ``data``).

    Args:
        data: Rows to evaluate.
        k: Cutoff rank passed to the ``@k`` metrics.
        return_failures: When true, include the per-row failure details and
            their count in the result.

    Returns:
        A dict with the mean of each metric under ``"hit@{k}"``,
        ``"precision@{k}"``, ``"recall@{k}"``, ``"mrr"`` and ``"ndcg@{k}"``
        (all 0.0 for empty ``data``). With ``return_failures`` it also holds
        ``"failures"``, one entry per row whose hit score is 0, and
        ``"num_failures"``.
    """
    per_metric: Dict[str, List[float]] = {
        "hit": [], "precision": [], "recall": [], "mrr": [], "ndcg": [],
    }
    failures = []

    for idx, row in enumerate(data):
        # A key that is present but explicitly None (e.g. JSON ``null``)
        # would otherwise reach the metric helpers and raise TypeError.
        golden = row.get("golden_chunk_ids")
        if golden is None:
            golden = []
        retrieved = row.get("retrieved_chunk_ids")
        if retrieved is None:
            retrieved = []

        q = row.get("question", "")
        qid = row.get("question_id", idx)

        scores = {
            "hit": hit_at_k(golden, retrieved, k),
            "precision": precision_at_k(golden, retrieved, k),
            "recall": recall_at_k(golden, retrieved, k),
            "mrr": mrr(golden, retrieved),
            "ndcg": ndcg_at_k(golden, retrieved, k),
        }
        for name, value in scores.items():
            per_metric[name].append(value)

        # A row counts as a failure when nothing relevant made the top k.
        if scores["hit"] == 0:
            failures.append({
                "question_id": qid,
                "question": q,
                "golden_chunk_ids": golden,
                "retrieved_chunk_ids": retrieved,
                "metrics": scores,
            })

    def avg(lst):
        # Empty input averages to 0.0 instead of dividing by zero.
        return sum(lst) / len(lst) if lst else 0.0

    result = {
        f"hit@{k}": avg(per_metric["hit"]),
        f"precision@{k}": avg(per_metric["precision"]),
        f"recall@{k}": avg(per_metric["recall"]),
        "mrr": avg(per_metric["mrr"]),
        f"ndcg@{k}": avg(per_metric["ndcg"]),
    }

    if return_failures:
        result["failures"] = failures
        result["num_failures"] = len(failures)

    return result
|
|
113
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rag-eval-lite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight evaluation metrics for RAG (Hit@k, Recall@k, MRR, nDCG)
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/rag-eval-lite
|
|
7
|
+
Keywords: rag,evaluation,retrieval,ml,nlp
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# rag-eval-lite
|
|
15
|
+
|
|
16
|
+
Lightweight RAG evaluation metrics.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
pip install rag-eval-lite
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
```python
|
|
23
|
+
from rag_eval import evaluate_dataset
|
|
24
|
+
|
|
25
|
+
results = evaluate_dataset(data, k=3)
|
|
26
|
+
print(results)
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|