graphrag-eval 6.3.0__tar.gz → 6.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/PKG-INFO +1 -1
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/aggregation.py +4 -4
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/answer_correctness.py +52 -15
- graphrag_eval-6.4.0/graphrag_eval/answer_relevance.py +61 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/cli/answer_correctness.py +36 -19
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/custom_evaluation.py +34 -18
- graphrag_eval-6.4.0/graphrag_eval/evaluation.py +154 -0
- graphrag_eval-6.4.0/graphrag_eval/evaluator.py +14 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/llm_factory.py +18 -8
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/evaluation.py +11 -3
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/pyproject.toml +1 -1
- graphrag_eval-6.3.0/graphrag_eval/answer_relevance.py +0 -29
- graphrag_eval-6.3.0/graphrag_eval/evaluation.py +0 -104
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/LICENSE +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/README.md +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/cli/__init__.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/iri_discovery.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/sparql.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/steps/timeseries.py +0 -0
- {graphrag_eval-6.3.0 → graphrag_eval-6.4.0}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 6.3.0
|
|
3
|
+
Version: 6.4.0
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import json
|
|
2
|
-
import yaml
|
|
3
2
|
from collections import defaultdict
|
|
4
3
|
from collections.abc import Sequence
|
|
5
4
|
from pathlib import Path
|
|
6
5
|
from statistics import mean, median
|
|
7
6
|
from typing import Any, Collection, Iterable
|
|
8
7
|
|
|
9
|
-
|
|
8
|
+
import yaml
|
|
10
9
|
|
|
10
|
+
from . import evaluation
|
|
11
11
|
|
|
12
12
|
METRICS = [
|
|
13
13
|
"answer_recall",
|
|
@@ -155,7 +155,7 @@ def compute_micro_stats(
|
|
|
155
155
|
) -> dict:
|
|
156
156
|
if custom_metrics is None:
|
|
157
157
|
custom_metrics = []
|
|
158
|
-
|
|
158
|
+
|
|
159
159
|
values = number_of_samples_per_template_by_status.values()
|
|
160
160
|
micro_summary = defaultdict(dict, {
|
|
161
161
|
"number_of_error_samples": sum(v["error"] for v in values),
|
|
@@ -197,7 +197,7 @@ def compute_macro_stats(
|
|
|
197
197
|
) -> dict:
|
|
198
198
|
if custom_metrics is None:
|
|
199
199
|
custom_metrics = []
|
|
200
|
-
|
|
200
|
+
|
|
201
201
|
macro_summary = defaultdict(dict)
|
|
202
202
|
for metric in METRICS + custom_metrics:
|
|
203
203
|
means = [
|
|
@@ -1,21 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
4
|
+
from typing import Any, Self, TYPE_CHECKING
|
|
2
5
|
|
|
3
6
|
from pydantic import BaseModel, Field
|
|
4
7
|
|
|
5
8
|
from graphrag_eval.util import compute_f1
|
|
9
|
+
from .evaluator import Evaluator
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
6
13
|
|
|
7
14
|
|
|
8
15
|
def load_default_prompt() -> str:
|
|
9
|
-
with open(
|
|
16
|
+
with open(
|
|
17
|
+
Path(__file__).parent / "prompts" / "template.md",
|
|
18
|
+
encoding="utf-8"
|
|
19
|
+
) as f:
|
|
10
20
|
return f.read()
|
|
11
21
|
|
|
12
22
|
|
|
13
23
|
class AnswerCorrectnessConfig(BaseModel):
|
|
24
|
+
enabled: bool = Field(default=True)
|
|
14
25
|
prompt: str = Field(default_factory=load_default_prompt)
|
|
15
26
|
|
|
16
27
|
|
|
17
28
|
class InvalidPromptException(Exception):
|
|
18
|
-
def __init__(
|
|
29
|
+
def __init__(
|
|
30
|
+
self,
|
|
31
|
+
message="The prompt template is invalid and cannot be "
|
|
32
|
+
"formatted."
|
|
33
|
+
):
|
|
19
34
|
self.message = message
|
|
20
35
|
super().__init__(self.message)
|
|
21
36
|
|
|
@@ -23,13 +38,25 @@ class InvalidPromptException(Exception):
|
|
|
23
38
|
class AnswerCorrectnessEvaluator:
|
|
24
39
|
def __init__(
|
|
25
40
|
self,
|
|
26
|
-
|
|
41
|
+
ragas_llm: InstructorBaseRagasLLM,
|
|
27
42
|
config: AnswerCorrectnessConfig | None = None,
|
|
28
43
|
):
|
|
29
44
|
self.config = config or AnswerCorrectnessConfig()
|
|
30
45
|
self.__validate_prompt_template(self.config.prompt)
|
|
31
46
|
self.prompt_template = self.config.prompt
|
|
32
|
-
self.
|
|
47
|
+
self.ragas_llm = ragas_llm
|
|
48
|
+
|
|
49
|
+
@classmethod
|
|
50
|
+
def from_config(
|
|
51
|
+
cls,
|
|
52
|
+
ragas_llm: InstructorBaseRagasLLM | None,
|
|
53
|
+
config: AnswerCorrectnessConfig | None
|
|
54
|
+
) -> Self | None:
|
|
55
|
+
if ragas_llm is None:
|
|
56
|
+
return None
|
|
57
|
+
if config is None or not config.enabled:
|
|
58
|
+
return None
|
|
59
|
+
return cls(ragas_llm=ragas_llm, config=config)
|
|
33
60
|
|
|
34
61
|
@staticmethod
|
|
35
62
|
def __validate_prompt_template(prompt_template: str):
|
|
@@ -48,7 +75,7 @@ class AnswerCorrectnessEvaluator:
|
|
|
48
75
|
|
|
49
76
|
async def _agenerate(self, prompt):
|
|
50
77
|
"""Wrapper method for easier testing"""
|
|
51
|
-
return (await self.
|
|
78
|
+
return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
|
|
52
79
|
|
|
53
80
|
async def evaluate_answer(
|
|
54
81
|
self,
|
|
@@ -56,9 +83,13 @@ class AnswerCorrectnessEvaluator:
|
|
|
56
83
|
reference_answer: str,
|
|
57
84
|
actual_answer: str
|
|
58
85
|
) -> tuple[int, int, int, str]:
|
|
59
|
-
if any(
|
|
60
|
-
|
|
61
|
-
|
|
86
|
+
if any(
|
|
87
|
+
not s.strip() for s in [question, reference_answer, actual_answer]
|
|
88
|
+
):
|
|
89
|
+
raise ValueError(
|
|
90
|
+
"The question of the reference or the actual answer is a blank "
|
|
91
|
+
"string!"
|
|
92
|
+
)
|
|
62
93
|
prompt = self.prompt_template.format(
|
|
63
94
|
question=question,
|
|
64
95
|
reference_answer=reference_answer,
|
|
@@ -67,12 +98,14 @@ class AnswerCorrectnessEvaluator:
|
|
|
67
98
|
response_str = await self._agenerate(prompt)
|
|
68
99
|
return self.extract_response_values(response_str)
|
|
69
100
|
|
|
70
|
-
async def get_correctness_dict(
|
|
101
|
+
async def evaluate(
|
|
71
102
|
self,
|
|
72
|
-
reference: dict,
|
|
73
|
-
actual: dict,
|
|
74
|
-
):
|
|
75
|
-
|
|
103
|
+
reference: dict[str, Any],
|
|
104
|
+
actual: dict[str, Any]
|
|
105
|
+
) -> dict[str, Any]:
|
|
106
|
+
if "actual_answer" not in actual or "reference_answer" not in reference:
|
|
107
|
+
return {}
|
|
108
|
+
result = {}
|
|
76
109
|
try:
|
|
77
110
|
num_ref_claims, num_actual_claims, num_matching_claims, reason = \
|
|
78
111
|
await self.evaluate_answer(
|
|
@@ -96,7 +129,7 @@ class AnswerCorrectnessEvaluator:
|
|
|
96
129
|
if f1 is not None:
|
|
97
130
|
result["answer_f1"] = f1
|
|
98
131
|
except Exception as exc:
|
|
99
|
-
result["
|
|
132
|
+
result["answer_correctness_error"] = str(exc)
|
|
100
133
|
return result
|
|
101
134
|
|
|
102
135
|
@staticmethod
|
|
@@ -134,6 +167,10 @@ class AnswerCorrectnessEvaluator:
|
|
|
134
167
|
n_matching > n_actual
|
|
135
168
|
]):
|
|
136
169
|
raise ValueError(
|
|
137
|
-
|
|
170
|
+
"Invalid claims counts combination: "
|
|
171
|
+
f"{n_ref}\t{n_actual}\t{n_matching}"
|
|
138
172
|
)
|
|
139
173
|
return n_ref, n_actual, n_matching, vals[3]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
_: Evaluator = AnswerCorrectnessEvaluator
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Self, TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from .evaluator import Evaluator
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
11
|
+
from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AnswerRelevanceConfig(BaseModel):
|
|
15
|
+
enabled: bool = Field(default=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AnswerRelevanceEvaluator:
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
ragas_llm: InstructorBaseRagasLLM,
|
|
22
|
+
ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding
|
|
23
|
+
):
|
|
24
|
+
from ragas.metrics.collections import AnswerRelevancy
|
|
25
|
+
self.scorer = AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embedder)
|
|
26
|
+
|
|
27
|
+
@classmethod
|
|
28
|
+
def from_config(
|
|
29
|
+
cls,
|
|
30
|
+
ragas_llm: InstructorBaseRagasLLM | None,
|
|
31
|
+
ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding | None,
|
|
32
|
+
config: AnswerRelevanceConfig | None
|
|
33
|
+
) -> Self | None:
|
|
34
|
+
if ragas_llm is None or ragas_embedder is None:
|
|
35
|
+
return None
|
|
36
|
+
if config is None or not config.enabled:
|
|
37
|
+
return None
|
|
38
|
+
return cls(ragas_llm=ragas_llm, ragas_embedder=ragas_embedder)
|
|
39
|
+
|
|
40
|
+
async def evaluate(
|
|
41
|
+
self,
|
|
42
|
+
reference: dict[str, Any],
|
|
43
|
+
actual: dict[str, Any]
|
|
44
|
+
) -> dict[str, Any]:
|
|
45
|
+
if "actual_answer" not in actual:
|
|
46
|
+
return {}
|
|
47
|
+
try:
|
|
48
|
+
result = await self.scorer.ascore(
|
|
49
|
+
user_input=reference["question_text"],
|
|
50
|
+
response=actual["actual_answer"]
|
|
51
|
+
)
|
|
52
|
+
return {
|
|
53
|
+
"answer_relevance": result.value
|
|
54
|
+
}
|
|
55
|
+
except Exception as e:
|
|
56
|
+
return {
|
|
57
|
+
"answer_relevance_error": str(e)
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
_: Evaluator = AnswerRelevanceEvaluator
|
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import argparse
|
|
2
4
|
import asyncio
|
|
3
5
|
import csv
|
|
4
6
|
from argparse import ArgumentParser
|
|
5
7
|
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
6
9
|
|
|
7
10
|
from tqdm import tqdm
|
|
8
11
|
|
|
@@ -10,34 +13,39 @@ from graphrag_eval import llm_factory
|
|
|
10
13
|
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
11
14
|
from graphrag_eval.evaluation import Config
|
|
12
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
18
|
+
|
|
13
19
|
|
|
14
20
|
def parse_args() -> argparse.Namespace:
|
|
15
21
|
parser = ArgumentParser(
|
|
16
|
-
description="Calculates answer correctness over the entries from the
|
|
17
|
-
"stores the output in the output tsv
|
|
22
|
+
description="Calculates answer correctness over the entries from the "
|
|
23
|
+
"input tsv file and stores the output in the output tsv "
|
|
24
|
+
"file.",
|
|
18
25
|
)
|
|
19
26
|
parser.add_argument(
|
|
20
27
|
"-i",
|
|
21
28
|
"--input-tsv-file-path",
|
|
22
29
|
type=Path,
|
|
23
30
|
required=True,
|
|
24
|
-
help="Input tsv file path with columns `Question`, `Reference answer`
|
|
31
|
+
help="Input tsv file path with columns `Question`, `Reference answer` "
|
|
32
|
+
"and `Actual answer`",
|
|
25
33
|
)
|
|
26
34
|
parser.add_argument(
|
|
27
35
|
"-o",
|
|
28
36
|
"--output-tsv-file-path",
|
|
29
37
|
type=Path,
|
|
30
38
|
required=True,
|
|
31
|
-
help="Output tsv file path with columns `#Reference`, `#PTarget`,
|
|
32
|
-
"`Reasoning`, `Error`",
|
|
39
|
+
help="Output tsv file path with columns `#Reference`, `#PTarget`, "
|
|
40
|
+
"`#Matching`, `Reasoning`, `Error`",
|
|
33
41
|
)
|
|
34
42
|
parser.add_argument(
|
|
35
43
|
"-c",
|
|
36
44
|
"--config-yaml-file-path",
|
|
37
45
|
type=Path,
|
|
38
46
|
required=True,
|
|
39
|
-
help="Config yaml file path with definition of the LLM to use and
|
|
40
|
-
"prompt.",
|
|
47
|
+
help="Config yaml file path with definition of the LLM to use and "
|
|
48
|
+
"optionally a custom prompt.",
|
|
41
49
|
)
|
|
42
50
|
return parser.parse_args()
|
|
43
51
|
|
|
@@ -54,7 +62,9 @@ async def evaluate_and_write(
|
|
|
54
62
|
output_tsv_file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
55
63
|
with open(output_tsv_file_path, "w", encoding="utf-8") as f:
|
|
56
64
|
writer = csv.writer(f, delimiter="\t")
|
|
57
|
-
writer.writerow(
|
|
65
|
+
writer.writerow(
|
|
66
|
+
["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"]
|
|
67
|
+
)
|
|
58
68
|
|
|
59
69
|
for row in tqdm(rows):
|
|
60
70
|
if "Question" not in row or \
|
|
@@ -81,19 +91,26 @@ def run(
|
|
|
81
91
|
output_tsv_file_path: Path,
|
|
82
92
|
):
|
|
83
93
|
config = Config.parse(config_yaml_file_path)
|
|
84
|
-
ragas_llm = llm_factory.create_llm(
|
|
94
|
+
ragas_llm: InstructorBaseRagasLLM | None = llm_factory.create_llm(
|
|
95
|
+
config.llm
|
|
96
|
+
)
|
|
85
97
|
if ragas_llm is None:
|
|
86
|
-
raise ValueError(
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
98
|
+
raise ValueError(
|
|
99
|
+
"LLM must be configured to calculate the answer correctness!"
|
|
100
|
+
)
|
|
101
|
+
if config.answer_correctness and not config.answer_correctness.enabled:
|
|
102
|
+
raise ValueError(
|
|
103
|
+
"Can't disable answer correctness, when running this script!"
|
|
91
104
|
)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
105
|
+
evaluator = AnswerCorrectnessEvaluator(
|
|
106
|
+
ragas_llm=ragas_llm,
|
|
107
|
+
config=config.answer_correctness,
|
|
108
|
+
)
|
|
109
|
+
asyncio.run(evaluate_and_write(
|
|
110
|
+
input_tsv_file_path,
|
|
111
|
+
output_tsv_file_path,
|
|
112
|
+
evaluator,
|
|
113
|
+
))
|
|
97
114
|
|
|
98
115
|
|
|
99
116
|
def main():
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
|
-
from typing import Literal
|
|
4
|
+
from typing import Literal, Self, TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
5
7
|
|
|
6
|
-
from
|
|
8
|
+
from .evaluator import Evaluator
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
7
12
|
|
|
8
13
|
RESERVED_KEYS = {
|
|
9
14
|
"template_id",
|
|
@@ -43,7 +48,7 @@ Inputs = Literal[
|
|
|
43
48
|
StepsKey = Literal["args", "output"]
|
|
44
49
|
|
|
45
50
|
|
|
46
|
-
class Config(BaseModel):
|
|
51
|
+
class EvaluatorConfig(BaseModel):
|
|
47
52
|
model_config = ConfigDict(extra='forbid')
|
|
48
53
|
name: str
|
|
49
54
|
inputs: list[Inputs] = Field(..., min_length=1)
|
|
@@ -53,7 +58,7 @@ class Config(BaseModel):
|
|
|
53
58
|
steps_keys: set[StepsKey] | None = Field(default=None, min_length=1)
|
|
54
59
|
|
|
55
60
|
@model_validator(mode='after')
|
|
56
|
-
def validate_step_dependencies(self) ->
|
|
61
|
+
def validate_step_dependencies(self) -> Self:
|
|
57
62
|
if set(self.inputs) & {"reference_steps", "actual_steps"}:
|
|
58
63
|
suffix = "is required when steps are in inputs"
|
|
59
64
|
for var_name in ["steps_name", "steps_keys"]:
|
|
@@ -62,7 +67,7 @@ class Config(BaseModel):
|
|
|
62
67
|
return self
|
|
63
68
|
|
|
64
69
|
@model_validator(mode='after')
|
|
65
|
-
def validate_name_and_outputs(self) ->
|
|
70
|
+
def validate_name_and_outputs(self) -> Self:
|
|
66
71
|
if self.name + "_error" in RESERVED_KEYS:
|
|
67
72
|
raise ValueError(f"Name {self.name} is reserved")
|
|
68
73
|
conflicting_keys = set(self.outputs.keys()) & RESERVED_KEYS
|
|
@@ -76,7 +81,7 @@ def create_input_template(input_key: str) -> str:
|
|
|
76
81
|
return f"# {header}\n{{{input_key}}}"
|
|
77
82
|
|
|
78
83
|
|
|
79
|
-
def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
84
|
+
def create_prompt_template(config: EvaluatorConfig, output_variables: list[str]) -> str:
|
|
80
85
|
"""
|
|
81
86
|
Return a template for the LLM prompt, with placeholders for the inputs,
|
|
82
87
|
instructions, outputs etc. We use this template at evaluation time to
|
|
@@ -99,8 +104,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
|
99
104
|
class CustomEvaluator:
|
|
100
105
|
def __init__(
|
|
101
106
|
self,
|
|
102
|
-
|
|
103
|
-
|
|
107
|
+
ragas_llm: InstructorBaseRagasLLM,
|
|
108
|
+
config: EvaluatorConfig,
|
|
104
109
|
):
|
|
105
110
|
self.name = config.name
|
|
106
111
|
self.input_variables = config.inputs
|
|
@@ -111,11 +116,24 @@ class CustomEvaluator:
|
|
|
111
116
|
config,
|
|
112
117
|
self.output_variables
|
|
113
118
|
)
|
|
114
|
-
self.
|
|
119
|
+
self.ragas_llm = ragas_llm
|
|
120
|
+
|
|
121
|
+
@classmethod
|
|
122
|
+
def from_config(
|
|
123
|
+
cls,
|
|
124
|
+
ragas_llm: InstructorBaseRagasLLM | None,
|
|
125
|
+
evaluation_configs: list[EvaluatorConfig] | None
|
|
126
|
+
) -> list[Self]:
|
|
127
|
+
if ragas_llm and evaluation_configs:
|
|
128
|
+
return [
|
|
129
|
+
cls(ragas_llm, evaluation_config)
|
|
130
|
+
for evaluation_config in evaluation_configs
|
|
131
|
+
]
|
|
132
|
+
return []
|
|
115
133
|
|
|
116
134
|
async def _agenerate(self, prompt: str) -> str:
|
|
117
135
|
"""Wrapper method for easier testing"""
|
|
118
|
-
return (await self.
|
|
136
|
+
return (await self.ragas_llm.agenerate(prompt, None)).choices[0].message.content
|
|
119
137
|
|
|
120
138
|
def format_steps(self, steps: list) -> str:
|
|
121
139
|
steps_formatted = []
|
|
@@ -157,7 +175,11 @@ class CustomEvaluator:
|
|
|
157
175
|
return result
|
|
158
176
|
return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
|
|
159
177
|
|
|
160
|
-
async def evaluate(
|
|
178
|
+
async def evaluate(
|
|
179
|
+
self,
|
|
180
|
+
reference: dict[str, Any],
|
|
181
|
+
actual: dict[str, Any]
|
|
182
|
+
) -> dict[str, Any]:
|
|
161
183
|
inputs = {}
|
|
162
184
|
if "question" in self.input_variables:
|
|
163
185
|
if "question_text" not in reference:
|
|
@@ -195,10 +217,4 @@ class CustomEvaluator:
|
|
|
195
217
|
return self.parse_outputs(response)
|
|
196
218
|
|
|
197
219
|
|
|
198
|
-
|
|
199
|
-
if config.custom_evaluations and config.llm:
|
|
200
|
-
return [
|
|
201
|
-
CustomEvaluator(custom_evaluation_config, config)
|
|
202
|
-
for custom_evaluation_config in config.custom_evaluations
|
|
203
|
-
]
|
|
204
|
-
return []
|
|
220
|
+
_: Evaluator = CustomEvaluator
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Self, TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
from pydantic import BaseModel, Field, model_validator
|
|
8
|
+
|
|
9
|
+
from .answer_correctness import (
|
|
10
|
+
AnswerCorrectnessConfig,
|
|
11
|
+
AnswerCorrectnessEvaluator,
|
|
12
|
+
)
|
|
13
|
+
from .answer_relevance import AnswerRelevanceConfig, AnswerRelevanceEvaluator
|
|
14
|
+
from .custom_evaluation import EvaluatorConfig, CustomEvaluator
|
|
15
|
+
from .evaluator import Evaluator
|
|
16
|
+
from .llm_factory import LLMConfig, create_llm, create_embedder
|
|
17
|
+
from .steps.evaluation import evaluate_steps
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
21
|
+
from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Config(BaseModel):
|
|
25
|
+
llm: LLMConfig | None = None
|
|
26
|
+
custom_evaluations: list[EvaluatorConfig] | None = Field(
|
|
27
|
+
default=None,
|
|
28
|
+
min_length=1
|
|
29
|
+
)
|
|
30
|
+
answer_correctness: AnswerCorrectnessConfig | None = None
|
|
31
|
+
answer_relevance: AnswerRelevanceConfig | None = None
|
|
32
|
+
|
|
33
|
+
@model_validator(mode="after")
|
|
34
|
+
def validate_config_and_set_defaults(self) -> Self:
|
|
35
|
+
has_llm = self.llm is not None
|
|
36
|
+
has_embedding = has_llm and self.llm.embedding is not None
|
|
37
|
+
|
|
38
|
+
if self.answer_correctness is None and has_llm:
|
|
39
|
+
self.answer_correctness = AnswerCorrectnessConfig()
|
|
40
|
+
|
|
41
|
+
if self.answer_relevance is None and has_embedding:
|
|
42
|
+
self.answer_relevance = AnswerRelevanceConfig()
|
|
43
|
+
|
|
44
|
+
if self.custom_evaluations and not has_llm:
|
|
45
|
+
raise ValueError(
|
|
46
|
+
"llm config is required if custom_evaluations are provided"
|
|
47
|
+
)
|
|
48
|
+
if (
|
|
49
|
+
self.answer_correctness
|
|
50
|
+
and self.answer_correctness.enabled
|
|
51
|
+
and not has_llm
|
|
52
|
+
):
|
|
53
|
+
raise ValueError(
|
|
54
|
+
"llm config is required if answer correctness is enabled"
|
|
55
|
+
)
|
|
56
|
+
if (
|
|
57
|
+
self.answer_relevance
|
|
58
|
+
and self.answer_relevance.enabled
|
|
59
|
+
and not has_embedding
|
|
60
|
+
):
|
|
61
|
+
raise ValueError(
|
|
62
|
+
"llm config including embedding is required if answer "
|
|
63
|
+
"relevance is enabled"
|
|
64
|
+
)
|
|
65
|
+
return self
|
|
66
|
+
|
|
67
|
+
@classmethod
|
|
68
|
+
def parse(cls, config_file_path: str | Path | None) -> Self:
|
|
69
|
+
if config_file_path:
|
|
70
|
+
with open(config_file_path, encoding="utf-8") as f:
|
|
71
|
+
config_dict = yaml.safe_load(f)
|
|
72
|
+
return cls(**config_dict)
|
|
73
|
+
return cls()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def run_evaluation(
|
|
77
|
+
qa_dataset: list[dict],
|
|
78
|
+
responses_dict: dict,
|
|
79
|
+
config_file_path: str | Path | None = None,
|
|
80
|
+
) -> list[dict]:
|
|
81
|
+
evaluators, ragas_llm = parse_config_and_init_evaluators(config_file_path)
|
|
82
|
+
|
|
83
|
+
# Output metrics are not nested, for simpler aggregation
|
|
84
|
+
evaluation_results = []
|
|
85
|
+
for template in qa_dataset:
|
|
86
|
+
template_id = template["template_id"]
|
|
87
|
+
for question in template["questions"]:
|
|
88
|
+
actual_result = responses_dict[question["id"]]
|
|
89
|
+
eval_result = {
|
|
90
|
+
"template_id": template_id,
|
|
91
|
+
"question_id": actual_result["question_id"],
|
|
92
|
+
"question_text": question["question_text"]
|
|
93
|
+
}
|
|
94
|
+
for key in ("input_tokens", "output_tokens", "total_tokens",
|
|
95
|
+
"elapsed_sec"):
|
|
96
|
+
if key in actual_result:
|
|
97
|
+
eval_result[key] = actual_result[key]
|
|
98
|
+
if "actual_answer" in actual_result:
|
|
99
|
+
eval_result["actual_answer"] = actual_result["actual_answer"]
|
|
100
|
+
if "reference_answer" in question:
|
|
101
|
+
eval_result["reference_answer"] = question["reference_answer"]
|
|
102
|
+
if "reference_steps" in question:
|
|
103
|
+
eval_result["reference_steps"] = question["reference_steps"]
|
|
104
|
+
if "error" in actual_result:
|
|
105
|
+
eval_result.update({
|
|
106
|
+
"status": "error",
|
|
107
|
+
"error": actual_result["error"],
|
|
108
|
+
})
|
|
109
|
+
else:
|
|
110
|
+
eval_result["status"] = "success"
|
|
111
|
+
|
|
112
|
+
eval_result.update(
|
|
113
|
+
await evaluate_steps(question, actual_result, ragas_llm)
|
|
114
|
+
)
|
|
115
|
+
for evaluator in evaluators:
|
|
116
|
+
eval_result.update(
|
|
117
|
+
await evaluator.evaluate(question, actual_result)
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
evaluation_results.append(eval_result)
|
|
121
|
+
return evaluation_results
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def parse_config_and_init_evaluators(
|
|
125
|
+
config_file_path: str | Path | None
|
|
126
|
+
) -> tuple[
|
|
127
|
+
list[Evaluator],
|
|
128
|
+
InstructorBaseRagasLLM | None,
|
|
129
|
+
]:
|
|
130
|
+
config = Config.parse(config_file_path)
|
|
131
|
+
ragas_llm: InstructorBaseRagasLLM | None = create_llm(config.llm)
|
|
132
|
+
ragas_embedder: BaseRagasEmbeddings | BaseRagasEmbedding | None = (
|
|
133
|
+
create_embedder(config.llm)
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
evaluators: list[Evaluator] = []
|
|
137
|
+
|
|
138
|
+
answer_relevance_evaluator = AnswerRelevanceEvaluator.from_config(
|
|
139
|
+
ragas_llm, ragas_embedder, config.answer_relevance
|
|
140
|
+
)
|
|
141
|
+
if answer_relevance_evaluator:
|
|
142
|
+
evaluators.append(answer_relevance_evaluator)
|
|
143
|
+
|
|
144
|
+
answer_correctness_evaluator = AnswerCorrectnessEvaluator.from_config(
|
|
145
|
+
ragas_llm, config.answer_correctness
|
|
146
|
+
)
|
|
147
|
+
if answer_correctness_evaluator:
|
|
148
|
+
evaluators.append(answer_correctness_evaluator)
|
|
149
|
+
|
|
150
|
+
evaluators.extend(
|
|
151
|
+
CustomEvaluator.from_config(ragas_llm, config.custom_evaluations)
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
return evaluators, ragas_llm
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from typing import Protocol, Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Evaluator(Protocol):
|
|
5
|
+
async def evaluate(
|
|
6
|
+
self,
|
|
7
|
+
reference: dict[str, Any],
|
|
8
|
+
actual: dict[str, Any]
|
|
9
|
+
) -> dict[str, Any]:
|
|
10
|
+
"""
|
|
11
|
+
Evaluate the actual output against the reference.
|
|
12
|
+
Returns a flat dictionary containing scores or error tracking logs.
|
|
13
|
+
"""
|
|
14
|
+
...
|
|
@@ -1,7 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
2
4
|
|
|
3
5
|
from pydantic import BaseModel, ConfigDict, Field
|
|
4
6
|
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
9
|
+
from ragas.embeddings.base import BaseRagasEmbeddings, BaseRagasEmbedding
|
|
10
|
+
|
|
5
11
|
|
|
6
12
|
class GenerationConfig(BaseModel):
|
|
7
13
|
provider: str
|
|
@@ -17,18 +23,20 @@ class EmbeddingConfig(BaseModel):
|
|
|
17
23
|
model_config = ConfigDict(extra='allow')
|
|
18
24
|
|
|
19
25
|
|
|
20
|
-
class Config(BaseModel):
|
|
26
|
+
class LLMConfig(BaseModel):
|
|
21
27
|
generation: GenerationConfig
|
|
22
28
|
embedding: EmbeddingConfig | None = None
|
|
23
29
|
|
|
24
30
|
|
|
25
|
-
def create_llm(
|
|
26
|
-
|
|
31
|
+
def create_llm(
|
|
32
|
+
config: LLMConfig | None
|
|
33
|
+
) -> InstructorBaseRagasLLM | None:
|
|
34
|
+
if config:
|
|
27
35
|
import litellm
|
|
28
36
|
from ragas.llms import llm_factory
|
|
29
37
|
|
|
30
38
|
litellm.drop_params = True # Remove unsupported params from requests
|
|
31
|
-
params = config.
|
|
39
|
+
params = config.generation.model_dump()
|
|
32
40
|
ragas_llm = llm_factory(
|
|
33
41
|
provider="litellm",
|
|
34
42
|
model=f"{params.pop('provider')}/{params.pop('model')}",
|
|
@@ -40,13 +48,15 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
|
|
|
40
48
|
return None
|
|
41
49
|
|
|
42
50
|
|
|
43
|
-
def create_embedder(
|
|
44
|
-
|
|
51
|
+
def create_embedder(
|
|
52
|
+
config: LLMConfig | None
|
|
53
|
+
) -> BaseRagasEmbeddings | BaseRagasEmbedding | None:
|
|
54
|
+
if config and config.embedding:
|
|
45
55
|
import litellm
|
|
46
56
|
from ragas.embeddings.base import embedding_factory
|
|
47
57
|
|
|
48
58
|
litellm.drop_params = True # Remove unsupported params from requests
|
|
49
|
-
params = config.
|
|
59
|
+
params = config.embedding.model_dump()
|
|
50
60
|
ragas_embedder = embedding_factory(
|
|
51
61
|
provider="litellm",
|
|
52
62
|
model=f"{params.pop('provider')}/{params.pop('model')}",
|
|
@@ -1,13 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
from collections import defaultdict
|
|
4
6
|
from collections.abc import Sequence
|
|
5
|
-
from typing import Any
|
|
7
|
+
from typing import Any, TYPE_CHECKING
|
|
6
8
|
|
|
7
9
|
from .iri_discovery import do_iri_discovery_steps_equal
|
|
8
10
|
from .retrieval_context_ids import recall_at_k
|
|
9
11
|
from .sparql import compare_sparql_results
|
|
10
|
-
from .timeseries import
|
|
12
|
+
from .timeseries import (
|
|
13
|
+
do_retrieve_time_series_steps_equal,
|
|
14
|
+
do_retrieve_data_points_steps_equal,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from ragas.llms.base import InstructorBaseRagasLLM
|
|
11
19
|
|
|
12
20
|
logger = logging.getLogger(__name__)
|
|
13
21
|
|
|
@@ -140,7 +148,7 @@ def calculate_steps_score(
|
|
|
140
148
|
async def evaluate_steps(
|
|
141
149
|
reference: dict,
|
|
142
150
|
actual: dict,
|
|
143
|
-
ragas_llm:
|
|
151
|
+
ragas_llm: InstructorBaseRagasLLM | None,
|
|
144
152
|
) -> dict:
|
|
145
153
|
eval_result = {}
|
|
146
154
|
actual_steps = actual.get("actual_steps", [])
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "6.3.0"
|
|
3
|
+
version = "6.4.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from ragas.embeddings.base import BaseRagasEmbedding
|
|
2
|
-
from ragas.llms.base import InstructorBaseRagasLLM
|
|
3
|
-
from ragas.metrics.collections import AnswerRelevancy
|
|
4
|
-
|
|
5
|
-
from graphrag_eval.util import singleton
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@singleton
|
|
9
|
-
class Evaluator:
|
|
10
|
-
def __init__(self, ragas_llm: InstructorBaseRagasLLM, ragas_embedder: BaseRagasEmbedding):
|
|
11
|
-
self.scorer = AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embedder)
|
|
12
|
-
|
|
13
|
-
async def get_relevance_dict(
|
|
14
|
-
self,
|
|
15
|
-
question_text: str,
|
|
16
|
-
actual_answer: str,
|
|
17
|
-
) -> dict:
|
|
18
|
-
try:
|
|
19
|
-
result = await self.scorer.ascore(
|
|
20
|
-
user_input=question_text,
|
|
21
|
-
response=actual_answer
|
|
22
|
-
)
|
|
23
|
-
return {
|
|
24
|
-
"answer_relevance": result.value
|
|
25
|
-
}
|
|
26
|
-
except Exception as e:
|
|
27
|
-
return {
|
|
28
|
-
"answer_relevance_error": str(e)
|
|
29
|
-
}
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import yaml
|
|
4
|
-
from pydantic import BaseModel, Field, model_validator
|
|
5
|
-
|
|
6
|
-
from . import custom_evaluation
|
|
7
|
-
from .answer_correctness import AnswerCorrectnessConfig
|
|
8
|
-
from .llm_factory import Config as LLMConfig, create_llm, create_embedder
|
|
9
|
-
from .steps.evaluation import evaluate_steps
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class Config(BaseModel):
|
|
13
|
-
llm: LLMConfig | None = None
|
|
14
|
-
custom_evaluations: list[custom_evaluation.Config] | None \
|
|
15
|
-
= Field(default=None, min_length=1)
|
|
16
|
-
answer_correctness: AnswerCorrectnessConfig | None = None
|
|
17
|
-
|
|
18
|
-
@model_validator(mode="after")
|
|
19
|
-
def validate_config(self) -> "Config":
|
|
20
|
-
if self.custom_evaluations and not self.llm:
|
|
21
|
-
msg = "llm config is required if custom_evaluations are provided"
|
|
22
|
-
raise ValueError(msg)
|
|
23
|
-
return self
|
|
24
|
-
|
|
25
|
-
@classmethod
|
|
26
|
-
def parse(cls, config_file_path: str | Path | None) -> "Config":
|
|
27
|
-
if config_file_path:
|
|
28
|
-
with open(config_file_path, encoding="utf-8") as f:
|
|
29
|
-
config_dict = yaml.safe_load(f)
|
|
30
|
-
return cls(**config_dict)
|
|
31
|
-
return cls()
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
async def run_evaluation(
|
|
35
|
-
qa_dataset: list[dict],
|
|
36
|
-
responses_dict: dict,
|
|
37
|
-
config_file_path: str | Path | None = None,
|
|
38
|
-
) -> list[dict]:
|
|
39
|
-
# Output metrics are not nested, for simpler aggregation
|
|
40
|
-
evaluation_results = []
|
|
41
|
-
config = Config.parse(config_file_path)
|
|
42
|
-
ragas_llm = create_llm(config)
|
|
43
|
-
ragas_embedder = create_embedder(config)
|
|
44
|
-
custom_evaluators = custom_evaluation.create_evaluators(config)
|
|
45
|
-
for template in qa_dataset:
|
|
46
|
-
template_id = template["template_id"]
|
|
47
|
-
for question in template["questions"]:
|
|
48
|
-
actual_result = responses_dict[question["id"]]
|
|
49
|
-
eval_result = {
|
|
50
|
-
"template_id": template_id,
|
|
51
|
-
"question_id": actual_result["question_id"],
|
|
52
|
-
"question_text": question["question_text"]
|
|
53
|
-
}
|
|
54
|
-
if "reference_answer" in question:
|
|
55
|
-
eval_result["reference_answer"] = question["reference_answer"]
|
|
56
|
-
if "reference_steps" in question:
|
|
57
|
-
eval_result["reference_steps"] = question["reference_steps"]
|
|
58
|
-
if "error" in actual_result:
|
|
59
|
-
eval_result.update({
|
|
60
|
-
"status": "error",
|
|
61
|
-
"error": actual_result["error"],
|
|
62
|
-
})
|
|
63
|
-
else:
|
|
64
|
-
eval_result["status"] = "success"
|
|
65
|
-
|
|
66
|
-
if "actual_answer" in actual_result:
|
|
67
|
-
eval_result["actual_answer"] = actual_result["actual_answer"]
|
|
68
|
-
if ragas_llm:
|
|
69
|
-
from graphrag_eval.answer_relevance import Evaluator
|
|
70
|
-
relevance_evaluator = Evaluator(ragas_llm, ragas_embedder)
|
|
71
|
-
eval_result.update(
|
|
72
|
-
await relevance_evaluator.get_relevance_dict(
|
|
73
|
-
question["question_text"],
|
|
74
|
-
actual_result["actual_answer"],
|
|
75
|
-
)
|
|
76
|
-
)
|
|
77
|
-
if "reference_answer" in question and ragas_llm:
|
|
78
|
-
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
79
|
-
answer_correctness_evaluator = AnswerCorrectnessEvaluator(
|
|
80
|
-
llm=ragas_llm,
|
|
81
|
-
config=config.answer_correctness,
|
|
82
|
-
)
|
|
83
|
-
eval_result.update(
|
|
84
|
-
await answer_correctness_evaluator.get_correctness_dict(
|
|
85
|
-
question,
|
|
86
|
-
actual_result,
|
|
87
|
-
)
|
|
88
|
-
)
|
|
89
|
-
eval_result.update(
|
|
90
|
-
await evaluate_steps(
|
|
91
|
-
question,
|
|
92
|
-
actual_result,
|
|
93
|
-
ragas_llm,
|
|
94
|
-
)
|
|
95
|
-
)
|
|
96
|
-
for custom_evaluator in custom_evaluators:
|
|
97
|
-
custom_metrics = await custom_evaluator.evaluate(question, actual_result)
|
|
98
|
-
eval_result.update(**custom_metrics)
|
|
99
|
-
for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
|
|
100
|
-
if key in actual_result:
|
|
101
|
-
eval_result[key] = actual_result[key]
|
|
102
|
-
|
|
103
|
-
evaluation_results.append(eval_result)
|
|
104
|
-
return evaluation_results
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|