graphrag-eval 6.1.0__tar.gz → 6.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphrag_eval-6.3.0/PKG-INFO +46 -0
- graphrag_eval-6.3.0/README.md +27 -0
- graphrag_eval-6.3.0/graphrag_eval/answer_correctness.py +139 -0
- graphrag_eval-6.3.0/graphrag_eval/cli/answer_correctness.py +105 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/custom_evaluation.py +15 -15
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/evaluation.py +9 -6
- graphrag_eval-6.1.0/graphrag_eval/llm.py → graphrag_eval-6.3.0/graphrag_eval/llm_factory.py +4 -2
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/prompts/template.md +1 -1
- graphrag_eval-6.3.0/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/pyproject.toml +3 -3
- graphrag_eval-6.1.0/PKG-INFO +0 -1310
- graphrag_eval-6.1.0/README.md +0 -1291
- graphrag_eval-6.1.0/graphrag_eval/answer_correctness.py +0 -192
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/LICENSE +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/aggregation.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-6.1.0/graphrag_eval/steps → graphrag_eval-6.3.0/graphrag_eval/cli}/__init__.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/evaluation.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/iri_discovery.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/sparql.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/steps/timeseries.py +0 -0
- {graphrag_eval-6.1.0 → graphrag_eval-6.3.0}/graphrag_eval/util.py +0 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: graphrag-eval
|
|
3
|
+
Version: 6.3.0
|
|
4
|
+
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Philip Ganchev
|
|
7
|
+
Author-email: philip.ganchev@graphwise.ai
|
|
8
|
+
Requires-Python: >=3.12,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Provides-Extra: llm
|
|
13
|
+
Requires-Dist: pydantic (==2.12.5)
|
|
14
|
+
Requires-Dist: python-dateutil (==2.9.0.post0)
|
|
15
|
+
Requires-Dist: ragas (==0.4.3) ; extra == "llm"
|
|
16
|
+
Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg?raw=true">
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
# QA Evaluation
|
|
24
|
+
|
|
25
|
+
This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
|
|
26
|
+
|
|
27
|
+
The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
|
|
28
|
+
|
|
29
|
+
The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
|
|
30
|
+
|
|
31
|
+
## Documentation
|
|
32
|
+
|
|
33
|
+
- [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
|
|
34
|
+
- [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
|
|
35
|
+
- [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
|
|
36
|
+
- [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
|
|
37
|
+
- [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
|
|
38
|
+
|
|
39
|
+
## Maintainers
|
|
40
|
+
|
|
41
|
+
Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
46
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg?raw=true">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# QA Evaluation
|
|
6
|
+
|
|
7
|
+
This is a Python library for assessing the quality of question-answering systems, such as systems built with LLM-based agents. It is agnostic to the agent implementation and the LLM it uses.
|
|
8
|
+
|
|
9
|
+
The evaluation is based on a user-provided reference dataset containing queries, reference responses, and optional reference steps, such as expected tool uses. The evaluator compares these references with the agent's actual responses and executed steps. Reference steps can be grouped to allow some expected steps to occur in any order.
|
|
10
|
+
|
|
11
|
+
The library provides built-in evaluation metrics and supports user-defined custom metrics ([§ Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)).
|
|
12
|
+
|
|
13
|
+
## Documentation
|
|
14
|
+
|
|
15
|
+
- [Quickstart](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/quickstart.md)
|
|
16
|
+
- [Metrics](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/metrics.md)
|
|
17
|
+
- [Configuration](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/config.md)
|
|
18
|
+
- [Input](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/input.md)
|
|
19
|
+
- [Output](https://github.com/Ontotext-AD/graphrag-eval/blob/main/docs/output.md)
|
|
20
|
+
|
|
21
|
+
## Maintainers
|
|
22
|
+
|
|
23
|
+
Developed and maintained by [Graphwise](https://graphwise.ai/). For issues and feature requests, please open a [GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
|
|
24
|
+
|
|
25
|
+
## License
|
|
26
|
+
|
|
27
|
+
Apache-2.0 License. See the [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from graphrag_eval.util import compute_f1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_default_prompt() -> str:
    """Return the text of the packaged default prompt template.

    The template is the file ``prompts/template.md`` located next to this
    module; it is decoded as UTF-8.
    """
    template_path = Path(__file__).parent / "prompts" / "template.md"
    return template_path.read_text(encoding="utf-8")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AnswerCorrectnessConfig(BaseModel):
    """Configuration of the answer-correctness evaluation."""

    # Prompt template text. When the field is not supplied, pydantic calls
    # load_default_prompt() to fill it in.
    prompt: str = Field(default_factory=load_default_prompt)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InvalidPromptException(Exception):
    """Raised when a prompt template cannot be formatted.

    The text passed to the constructor is available both as ``str(exc)`` and
    as the ``message`` attribute.
    """

    def __init__(self, message="The prompt template is invalid and cannot be formatted."):
        super().__init__(message)
        self.message = message
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AnswerCorrectnessEvaluator:
|
|
24
|
+
def __init__(
|
|
25
|
+
self,
|
|
26
|
+
llm: "InstructorBaseRagasLLM",
|
|
27
|
+
config: AnswerCorrectnessConfig | None = None,
|
|
28
|
+
):
|
|
29
|
+
self.config = config or AnswerCorrectnessConfig()
|
|
30
|
+
self.__validate_prompt_template(self.config.prompt)
|
|
31
|
+
self.prompt_template = self.config.prompt
|
|
32
|
+
self.llm = llm
|
|
33
|
+
|
|
34
|
+
@staticmethod
|
|
35
|
+
def __validate_prompt_template(prompt_template: str):
|
|
36
|
+
try:
|
|
37
|
+
prompt_template.format(
|
|
38
|
+
question="Q?",
|
|
39
|
+
reference_answer="R",
|
|
40
|
+
actual_answer="A",
|
|
41
|
+
)
|
|
42
|
+
except Exception as exc:
|
|
43
|
+
raise InvalidPromptException(
|
|
44
|
+
"Invalid prompt template. Must only contain placeholders: "
|
|
45
|
+
"{question}, {reference_answer}, and {actual_answer}. "
|
|
46
|
+
f"Original error: {exc}"
|
|
47
|
+
) from exc
|
|
48
|
+
|
|
49
|
+
async def _agenerate(self, prompt):
|
|
50
|
+
"""Wrapper method for easier testing"""
|
|
51
|
+
return (await self.llm.agenerate(prompt, None)).choices[0].message.content
|
|
52
|
+
|
|
53
|
+
async def evaluate_answer(
|
|
54
|
+
self,
|
|
55
|
+
question: str,
|
|
56
|
+
reference_answer: str,
|
|
57
|
+
actual_answer: str
|
|
58
|
+
) -> tuple[int, int, int, str]:
|
|
59
|
+
if any(not s.strip() for s in [question, reference_answer, actual_answer]):
|
|
60
|
+
raise ValueError("The question of the reference or the actual answer is a blank "
|
|
61
|
+
"string!")
|
|
62
|
+
prompt = self.prompt_template.format(
|
|
63
|
+
question=question,
|
|
64
|
+
reference_answer=reference_answer,
|
|
65
|
+
actual_answer=actual_answer,
|
|
66
|
+
)
|
|
67
|
+
response_str = await self._agenerate(prompt)
|
|
68
|
+
return self.extract_response_values(response_str)
|
|
69
|
+
|
|
70
|
+
async def get_correctness_dict(
|
|
71
|
+
self,
|
|
72
|
+
reference: dict,
|
|
73
|
+
actual: dict,
|
|
74
|
+
):
|
|
75
|
+
result = {"reference_answer": reference["reference_answer"]}
|
|
76
|
+
try:
|
|
77
|
+
num_ref_claims, num_actual_claims, num_matching_claims, reason = \
|
|
78
|
+
await self.evaluate_answer(
|
|
79
|
+
reference["question_text"],
|
|
80
|
+
reference["reference_answer"],
|
|
81
|
+
actual["actual_answer"],
|
|
82
|
+
)
|
|
83
|
+
result.update({
|
|
84
|
+
"answer_reference_claims_count": num_ref_claims,
|
|
85
|
+
"answer_actual_claims_count": num_actual_claims,
|
|
86
|
+
"answer_matching_claims_count": num_matching_claims,
|
|
87
|
+
"answer_correctness_reason": reason,
|
|
88
|
+
})
|
|
89
|
+
recall, precision, f1 = self.compute_recall_precision_f1(
|
|
90
|
+
num_ref_claims, num_actual_claims, num_matching_claims
|
|
91
|
+
)
|
|
92
|
+
if recall is not None:
|
|
93
|
+
result["answer_recall"] = recall
|
|
94
|
+
if precision is not None:
|
|
95
|
+
result["answer_precision"] = precision
|
|
96
|
+
if f1 is not None:
|
|
97
|
+
result["answer_f1"] = f1
|
|
98
|
+
except Exception as exc:
|
|
99
|
+
result["answer_eval_error"] = str(exc)
|
|
100
|
+
return result
|
|
101
|
+
|
|
102
|
+
@staticmethod
|
|
103
|
+
def compute_recall_precision_f1(
|
|
104
|
+
n_pos: int,
|
|
105
|
+
n_pred_pos: int,
|
|
106
|
+
n_true_pos: int,
|
|
107
|
+
) -> tuple[float | None, float | None, float | None]:
|
|
108
|
+
recall = None
|
|
109
|
+
precision = None
|
|
110
|
+
if n_pos:
|
|
111
|
+
recall = n_true_pos / n_pos
|
|
112
|
+
if n_pred_pos:
|
|
113
|
+
precision = n_true_pos / n_pred_pos
|
|
114
|
+
return recall, precision, compute_f1(recall, precision)
|
|
115
|
+
|
|
116
|
+
@staticmethod
|
|
117
|
+
def extract_response_values(
|
|
118
|
+
response: str
|
|
119
|
+
) -> tuple[int, int, int, str]:
|
|
120
|
+
vals = response.split("\t")
|
|
121
|
+
n = len(vals)
|
|
122
|
+
if n < 4:
|
|
123
|
+
raise ValueError(f"Expected 4 tab-separated values: {response}")
|
|
124
|
+
vals = vals[:4]
|
|
125
|
+
try:
|
|
126
|
+
n_ref, n_actual, n_matching = map(int, vals[:3])
|
|
127
|
+
except ValueError:
|
|
128
|
+
raise ValueError(f"Claims counts should be ints: {vals}")
|
|
129
|
+
if any([
|
|
130
|
+
n_ref < 1,
|
|
131
|
+
n_actual < 1,
|
|
132
|
+
n_matching < 0,
|
|
133
|
+
n_matching > n_ref,
|
|
134
|
+
n_matching > n_actual
|
|
135
|
+
]):
|
|
136
|
+
raise ValueError(
|
|
137
|
+
f"Invalid claims counts combination: {n_ref}\t{n_actual}\t{n_matching}"
|
|
138
|
+
)
|
|
139
|
+
return n_ref, n_actual, n_matching, vals[3]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import asyncio
|
|
3
|
+
import csv
|
|
4
|
+
from argparse import ArgumentParser
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from graphrag_eval import llm_factory
|
|
10
|
+
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
11
|
+
from graphrag_eval.evaluation import Config
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the answer-correctness tool.

    All three options are required and are converted to ``Path`` objects:
    the input TSV file, the output TSV file and the YAML config file.
    """
    cli = ArgumentParser(
        description="Calculates answer correctness over the entries from the input tsv file and "
                    "stores the output in the output tsv file.",
    )
    # (short flag, long flag, help text) for each required path option.
    path_options = (
        (
            "-i",
            "--input-tsv-file-path",
            "Input tsv file path with columns `Question`, `Reference answer` and `Actual answer`",
        ),
        (
            "-o",
            "--output-tsv-file-path",
            "Output tsv file path with columns `#Reference`, `#PTarget`, `#Matching`, "
            "`Reasoning`, `Error`",
        ),
        (
            "-c",
            "--config-yaml-file-path",
            "Config yaml file path with definition of the LLM to use and optionally a custom "
            "prompt.",
        ),
    )
    for short_flag, long_flag, help_text in path_options:
        cli.add_argument(
            short_flag,
            long_flag,
            type=Path,
            required=True,
            help=help_text,
        )
    return cli.parse_args()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def evaluate_and_write(
        input_tsv_file_path: Path,
        output_tsv_file_path: Path,
        evaluator: AnswerCorrectnessEvaluator,
) -> None:
    """Evaluate every row of the input TSV and write one result row each.

    The input must have the columns ``Question``, ``Reference answer`` and
    ``Actual answer``. The output gets a header row followed by one row per
    input row: the values returned by ``evaluator.evaluate_answer`` plus an
    empty error column, or four empty columns plus the error text when the
    evaluation of that row raised.

    Raises:
        ValueError: If a required input column is missing. This is checked
            before the output file is created or truncated.
    """
    required_columns = ("Question", "Reference answer", "Actual answer")

    # newline="" is what the csv module asks for: it lets the reader/writer
    # handle line endings themselves (embedded newlines in quoted fields, no
    # doubled "\r" on Windows).
    with open(input_tsv_file_path, encoding="utf-8", newline="") as f:
        rows = list(csv.DictReader(f, delimiter="\t"))

    # Fail before touching the output so bad input never clobbers an existing
    # results file.
    if any(column not in row for row in rows for column in required_columns):
        raise ValueError("Unexpected input format!")

    print(f"Writing results to {output_tsv_file_path}")
    output_tsv_file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_tsv_file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["#Reference", "#PTarget", "#Matching", "Reasoning", "Error"])

        for row in tqdm(rows):
            try:
                vals = await evaluator.evaluate_answer(
                    row["Question"],
                    row["Reference answer"],
                    row["Actual answer"]
                )
                writer.writerow(vals + ("",))
            except Exception as exc:
                # Best effort per row: record the failure and carry on.
                writer.writerow(["", "", "", "", str(exc)])
            # NOTE(review): flushed after every row so partial results reach
            # disk; the original indentation was not visible — confirm.
            f.flush()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def run(
        config_yaml_file_path: Path,
        input_tsv_file_path: Path,
        output_tsv_file_path: Path,
):
    """Load the config, build the evaluator and process the input file.

    Raises:
        ValueError: If ``llm_factory.create_llm`` returns ``None`` for the
            parsed config.
    """
    eval_config = Config.parse(config_yaml_file_path)
    llm = llm_factory.create_llm(eval_config)
    # Guard clause: nothing can be evaluated without an LLM.
    if llm is None:
        raise ValueError("LLM must be configured to calculate the answer correctness!")

    correctness_evaluator = AnswerCorrectnessEvaluator(
        llm=llm,
        config=eval_config.answer_correctness,
    )
    job = evaluate_and_write(
        input_tsv_file_path,
        output_tsv_file_path,
        correctness_evaluator,
    )
    asyncio.run(job)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def main():
    """Command-line entry point: parse the options and run the evaluation."""
    cli_args = parse_args()
    run(
        config_yaml_file_path=cli_args.config_yaml_file_path,
        input_tsv_file_path=cli_args.input_tsv_file_path,
        output_tsv_file_path=cli_args.output_tsv_file_path,
    )
|
|
@@ -3,6 +3,7 @@ from typing import Literal
|
|
|
3
3
|
|
|
4
4
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
5
5
|
|
|
6
|
+
from graphrag_eval.llm_factory import create_llm
|
|
6
7
|
|
|
7
8
|
RESERVED_KEYS = {
|
|
8
9
|
"template_id",
|
|
@@ -31,7 +32,6 @@ RESERVED_KEYS = {
|
|
|
31
32
|
"elapsed_sec",
|
|
32
33
|
}
|
|
33
34
|
|
|
34
|
-
|
|
35
35
|
Inputs = Literal[
|
|
36
36
|
"question",
|
|
37
37
|
"reference_answer",
|
|
@@ -84,8 +84,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
|
84
84
|
|
|
85
85
|
output_variables specifies the order of the outputs.
|
|
86
86
|
"""
|
|
87
|
-
output_instructions = "Output the following values separated by tabs:"\
|
|
88
|
-
|
|
87
|
+
output_instructions = "Output the following values separated by tabs:" \
|
|
88
|
+
+ "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
|
|
89
89
|
inputs_template = "\n\n".join(
|
|
90
90
|
create_input_template(k) for k in config.inputs
|
|
91
91
|
)
|
|
@@ -98,9 +98,9 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
|
98
98
|
|
|
99
99
|
class CustomEvaluator:
|
|
100
100
|
def __init__(
|
|
101
|
-
self,
|
|
101
|
+
self,
|
|
102
102
|
config: Config,
|
|
103
|
-
|
|
103
|
+
eval_config: "evaluation.Config",
|
|
104
104
|
):
|
|
105
105
|
self.name = config.name
|
|
106
106
|
self.input_variables = config.inputs
|
|
@@ -111,11 +111,11 @@ class CustomEvaluator:
|
|
|
111
111
|
config,
|
|
112
112
|
self.output_variables
|
|
113
113
|
)
|
|
114
|
-
self.llm =
|
|
114
|
+
self.llm = create_llm(eval_config)
|
|
115
115
|
|
|
116
|
-
def
|
|
116
|
+
async def _agenerate(self, prompt: str) -> str:
|
|
117
117
|
"""Wrapper method for easier testing"""
|
|
118
|
-
return self.llm.
|
|
118
|
+
return (await self.llm.agenerate(prompt, None)).choices[0].message.content
|
|
119
119
|
|
|
120
120
|
def format_steps(self, steps: list) -> str:
|
|
121
121
|
steps_formatted = []
|
|
@@ -134,9 +134,9 @@ class CustomEvaluator:
|
|
|
134
134
|
step_out[k] = val
|
|
135
135
|
else:
|
|
136
136
|
step_out[k] = val
|
|
137
|
-
steps_formatted.append(step_out)
|
|
137
|
+
steps_formatted.append(step_out)
|
|
138
138
|
return json.dumps(steps_formatted, indent=2)
|
|
139
|
-
|
|
139
|
+
|
|
140
140
|
def error(self, msg: str) -> dict:
|
|
141
141
|
result = {k: None for k in self.output_variables}
|
|
142
142
|
result[self.name + '_error'] = msg
|
|
@@ -157,7 +157,7 @@ class CustomEvaluator:
|
|
|
157
157
|
return result
|
|
158
158
|
return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
|
|
159
159
|
|
|
160
|
-
def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
|
|
160
|
+
async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
|
|
161
161
|
inputs = {}
|
|
162
162
|
if "question" in self.input_variables:
|
|
163
163
|
if "question_text" not in reference:
|
|
@@ -176,7 +176,7 @@ class CustomEvaluator:
|
|
|
176
176
|
return self.error("Reference missing key 'reference_steps'")
|
|
177
177
|
try:
|
|
178
178
|
formatted_steps_lists = [
|
|
179
|
-
self.format_steps(group)
|
|
179
|
+
self.format_steps(group)
|
|
180
180
|
for group in reference["reference_steps"]
|
|
181
181
|
]
|
|
182
182
|
except json.JSONDecodeError:
|
|
@@ -191,14 +191,14 @@ class CustomEvaluator:
|
|
|
191
191
|
return self.error("Malformed actual step JSON")
|
|
192
192
|
inputs["actual_steps"] = formatted_steps_lists
|
|
193
193
|
prompt = self.prompt_template.format(**inputs)
|
|
194
|
-
response = self.
|
|
194
|
+
response = await self._agenerate(prompt)
|
|
195
195
|
return self.parse_outputs(response)
|
|
196
196
|
|
|
197
197
|
|
|
198
198
|
def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
|
|
199
199
|
if config.custom_evaluations and config.llm:
|
|
200
200
|
return [
|
|
201
|
-
CustomEvaluator(
|
|
202
|
-
for
|
|
201
|
+
CustomEvaluator(custom_evaluation_config, config)
|
|
202
|
+
for custom_evaluation_config in config.custom_evaluations
|
|
203
203
|
]
|
|
204
204
|
return []
|
|
@@ -4,7 +4,8 @@ import yaml
|
|
|
4
4
|
from pydantic import BaseModel, Field, model_validator
|
|
5
5
|
|
|
6
6
|
from . import custom_evaluation
|
|
7
|
-
from .
|
|
7
|
+
from .answer_correctness import AnswerCorrectnessConfig
|
|
8
|
+
from .llm_factory import Config as LLMConfig, create_llm, create_embedder
|
|
8
9
|
from .steps.evaluation import evaluate_steps
|
|
9
10
|
|
|
10
11
|
|
|
@@ -12,6 +13,7 @@ class Config(BaseModel):
|
|
|
12
13
|
llm: LLMConfig | None = None
|
|
13
14
|
custom_evaluations: list[custom_evaluation.Config] | None \
|
|
14
15
|
= Field(default=None, min_length=1)
|
|
16
|
+
answer_correctness: AnswerCorrectnessConfig | None = None
|
|
15
17
|
|
|
16
18
|
@model_validator(mode="after")
|
|
17
19
|
def validate_config(self) -> "Config":
|
|
@@ -19,7 +21,7 @@ class Config(BaseModel):
|
|
|
19
21
|
msg = "llm config is required if custom_evaluations are provided"
|
|
20
22
|
raise ValueError(msg)
|
|
21
23
|
return self
|
|
22
|
-
|
|
24
|
+
|
|
23
25
|
@classmethod
|
|
24
26
|
def parse(cls, config_file_path: str | Path | None) -> "Config":
|
|
25
27
|
if config_file_path:
|
|
@@ -75,10 +77,11 @@ async def run_evaluation(
|
|
|
75
77
|
if "reference_answer" in question and ragas_llm:
|
|
76
78
|
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
77
79
|
answer_correctness_evaluator = AnswerCorrectnessEvaluator(
|
|
78
|
-
llm=ragas_llm
|
|
80
|
+
llm=ragas_llm,
|
|
81
|
+
config=config.answer_correctness,
|
|
79
82
|
)
|
|
80
83
|
eval_result.update(
|
|
81
|
-
answer_correctness_evaluator.get_correctness_dict(
|
|
84
|
+
await answer_correctness_evaluator.get_correctness_dict(
|
|
82
85
|
question,
|
|
83
86
|
actual_result,
|
|
84
87
|
)
|
|
@@ -90,8 +93,8 @@ async def run_evaluation(
|
|
|
90
93
|
ragas_llm,
|
|
91
94
|
)
|
|
92
95
|
)
|
|
93
|
-
for
|
|
94
|
-
custom_metrics =
|
|
96
|
+
for custom_evaluator in custom_evaluators:
|
|
97
|
+
custom_metrics = await custom_evaluator.evaluate(question, actual_result)
|
|
95
98
|
eval_result.update(**custom_metrics)
|
|
96
99
|
for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
|
|
97
100
|
if key in actual_result:
|
|
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
|
6
6
|
class GenerationConfig(BaseModel):
|
|
7
7
|
provider: str
|
|
8
8
|
model: str
|
|
9
|
-
temperature: float = Field(ge=0.0, le=2.0)
|
|
10
|
-
max_tokens: int = Field(ge=1)
|
|
9
|
+
temperature: float = Field(default=0.0, ge=0.0, le=2.0)
|
|
10
|
+
max_tokens: int | None = Field(default=None, ge=1)
|
|
11
11
|
model_config = ConfigDict(extra='allow')
|
|
12
12
|
|
|
13
13
|
|
|
@@ -37,6 +37,7 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
|
|
|
37
37
|
)
|
|
38
38
|
ragas_llm.is_async = True
|
|
39
39
|
return ragas_llm
|
|
40
|
+
return None
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
|
|
@@ -53,3 +54,4 @@ def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding
|
|
|
53
54
|
**params,
|
|
54
55
|
)
|
|
55
56
|
return ragas_embedder
|
|
57
|
+
return None
|
|
File without changes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "6.
|
|
3
|
+
version = "6.3.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
@@ -24,7 +24,7 @@ llm = ["ragas", "litellm", "pyyaml"]
|
|
|
24
24
|
|
|
25
25
|
[tool.poetry.group.llm.dependencies]
|
|
26
26
|
ragas = "0.4.3"
|
|
27
|
-
litellm = "1.
|
|
27
|
+
litellm = "1.85.1"
|
|
28
28
|
pyyaml = "6.0.3"
|
|
29
29
|
|
|
30
30
|
[tool.poetry.group.llm]
|
|
@@ -41,7 +41,7 @@ pyyaml = "6.0.3"
|
|
|
41
41
|
optional = true
|
|
42
42
|
|
|
43
43
|
[project.scripts]
|
|
44
|
-
answer-correctness = "graphrag_eval.answer_correctness:main"
|
|
44
|
+
answer-correctness = "graphrag_eval.cli.answer_correctness:main"
|
|
45
45
|
|
|
46
46
|
[build-system]
|
|
47
47
|
requires = ["poetry-core>=2.0.0"]
|