graphrag-eval 6.0.0__tar.gz → 6.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/PKG-INFO +4 -3
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/README.md +3 -2
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/answer_correctness.py +22 -22
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/custom_evaluation.py +15 -15
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/evaluation.py +5 -5
- graphrag_eval-6.0.0/graphrag_eval/llm.py → graphrag_eval-6.2.0/graphrag_eval/llm_factory.py +2 -0
- graphrag_eval-6.2.0/graphrag_eval/steps/iri_discovery.py +21 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/pyproject.toml +2 -2
- graphrag_eval-6.0.0/graphrag_eval/steps/iri_discovery.py +0 -20
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/LICENSE +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/aggregation.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/evaluation.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/sparql.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/steps/timeseries.py +0 -0
- {graphrag_eval-6.0.0 → graphrag_eval-6.2.0}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 6.0.0
|
|
3
|
+
Version: 6.2.0
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -1167,8 +1167,9 @@ the steps are matching.
|
|
|
1167
1167
|
- if both are named "retrieve_data_points", then we check if the arguments of
|
|
1168
1168
|
the steps are matching.
|
|
1169
1169
|
- if the reference step is named "iri_discovery" and the actual step name is
|
|
1170
|
-
"autocomplete_search",
|
|
1171
|
-
"iri_discovery" step is present in the "output" of the
|
|
1170
|
+
"autocomplete_search" or "sparql_query", then check if the IRI specified as
|
|
1171
|
+
"output" of the "iri_discovery" step is present in the "output" of the
|
|
1172
|
+
actual step.
|
|
1172
1173
|
- if the reference and actual step names are the same and the
|
|
1173
1174
|
"output_media_type" of the reference step is "application/json", then the steps
|
|
1174
1175
|
match, if the json outputs are the same.
|
|
@@ -1149,8 +1149,9 @@ the steps are matching.
|
|
|
1149
1149
|
- if both are named "retrieve_data_points", then we check if the arguments of
|
|
1150
1150
|
the steps are matching.
|
|
1151
1151
|
- if the reference step is named "iri_discovery" and the actual step name is
|
|
1152
|
-
"autocomplete_search",
|
|
1153
|
-
"iri_discovery" step is present in the "output" of the
|
|
1152
|
+
"autocomplete_search" or "sparql_query", then check if the IRI specified as
|
|
1153
|
+
"output" of the "iri_discovery" step is present in the "output" of the
|
|
1154
|
+
actual step.
|
|
1154
1155
|
- if the reference and actual step names are the same and the
|
|
1155
1156
|
"output_media_type" of the reference step is "application/json", then the steps
|
|
1156
1157
|
match, if the json outputs are the same.
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import csv
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from tqdm import tqdm
|
|
5
6
|
|
|
6
|
-
from graphrag_eval import llm
|
|
7
|
+
from graphrag_eval import llm_factory
|
|
7
8
|
from graphrag_eval.evaluation import Config
|
|
8
9
|
from graphrag_eval.util import compute_f1, singleton
|
|
9
10
|
|
|
10
|
-
|
|
11
11
|
IN_FILE_PATH = "../data/data-1.tsv"
|
|
12
12
|
PROMPT_FILE_PATH = Path(__file__).parent / "prompts" / "template.md"
|
|
13
13
|
OUT_FILE_PATH = "results/data-1.tsv"
|
|
@@ -26,17 +26,17 @@ def parse_args() -> "argparse.Namespace":
|
|
|
26
26
|
f = float(value)
|
|
27
27
|
except ValueError:
|
|
28
28
|
raise ArgumentTypeError(f"Invalid float value: {value}")
|
|
29
|
-
|
|
29
|
+
|
|
30
30
|
if f <= 0.0 or f >= 2.0:
|
|
31
31
|
raise ArgumentTypeError(f"Value must be between 0.0 and 2.0, got {f}")
|
|
32
32
|
return f
|
|
33
33
|
|
|
34
|
-
parser = ArgumentParser()
|
|
34
|
+
parser = ArgumentParser()
|
|
35
35
|
parser.add_argument("-i", "--in-file", type=str, default=IN_FILE_PATH)
|
|
36
36
|
parser.add_argument("-o", "--out-file", type=str, default=OUT_FILE_PATH)
|
|
37
37
|
parser.add_argument("-p", "--provider", type=str, default=LLM_PROVIDER)
|
|
38
38
|
parser.add_argument("-l", "--llm", type=str, default=LLM_MODEL)
|
|
39
|
-
parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
|
|
39
|
+
parser.add_argument("-m", "--max-tokens", type=int, default=MAX_TOKENS)
|
|
40
40
|
parser.add_argument(
|
|
41
41
|
"-t",
|
|
42
42
|
"--temperature",
|
|
@@ -97,11 +97,11 @@ class AnswerCorrectnessEvaluator:
|
|
|
97
97
|
self.prompt_template = f.read()
|
|
98
98
|
self.llm = llm
|
|
99
99
|
|
|
100
|
-
def
|
|
100
|
+
async def _agenerate(self, prompt):
|
|
101
101
|
"""Wrapper method for easier testing"""
|
|
102
|
-
return self.llm.
|
|
102
|
+
return (await self.llm.agenerate(prompt, None)).choices[0].message.content
|
|
103
103
|
|
|
104
|
-
def evaluate_answer(
|
|
104
|
+
async def evaluate_answer(
|
|
105
105
|
self,
|
|
106
106
|
question: str,
|
|
107
107
|
reference_answer: str,
|
|
@@ -112,21 +112,21 @@ class AnswerCorrectnessEvaluator:
|
|
|
112
112
|
reference_answer=reference_answer,
|
|
113
113
|
candidate_answer=actual_answer,
|
|
114
114
|
)
|
|
115
|
-
response_str = self.
|
|
115
|
+
response_str = await self._agenerate(prompt)
|
|
116
116
|
return extract_response_values(response_str)
|
|
117
117
|
|
|
118
|
-
def get_correctness_dict(
|
|
118
|
+
async def get_correctness_dict(
|
|
119
119
|
self,
|
|
120
120
|
reference: dict,
|
|
121
121
|
actual: dict,
|
|
122
122
|
):
|
|
123
123
|
result = {"reference_answer": reference["reference_answer"]}
|
|
124
124
|
num_ref_claims, num_actual_claims, num_matching_claims, reason, error = \
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
125
|
+
await self.evaluate_answer(
|
|
126
|
+
reference["question_text"],
|
|
127
|
+
reference["reference_answer"],
|
|
128
|
+
actual["actual_answer"],
|
|
129
|
+
)
|
|
130
130
|
if error:
|
|
131
131
|
result["answer_eval_error"] = error
|
|
132
132
|
else:
|
|
@@ -148,12 +148,12 @@ class AnswerCorrectnessEvaluator:
|
|
|
148
148
|
return result
|
|
149
149
|
|
|
150
150
|
|
|
151
|
-
def evaluate_and_write(
|
|
151
|
+
async def evaluate_and_write(
|
|
152
152
|
in_file_path: str | Path,
|
|
153
153
|
out_file_path: str | Path,
|
|
154
154
|
config: "evaluation.Config",
|
|
155
155
|
) -> None:
|
|
156
|
-
ragas_llm =
|
|
156
|
+
ragas_llm = llm_factory.create_llm(config)
|
|
157
157
|
evaluator = AnswerCorrectnessEvaluator(llm=ragas_llm)
|
|
158
158
|
with open(in_file_path, encoding="utf-8") as f:
|
|
159
159
|
reader = csv.DictReader(f, delimiter="\t")
|
|
@@ -164,7 +164,7 @@ def evaluate_and_write(
|
|
|
164
164
|
writer = csv.writer(f, delimiter="\t")
|
|
165
165
|
writer.writerow(OUT_FIELDS)
|
|
166
166
|
for row in tqdm(rows):
|
|
167
|
-
vals = evaluator.evaluate_answer(
|
|
167
|
+
vals = await evaluator.evaluate_answer(
|
|
168
168
|
row["Question"],
|
|
169
169
|
row["Reference answer"],
|
|
170
170
|
row["Actual answer"]
|
|
@@ -176,8 +176,8 @@ def evaluate_and_write(
|
|
|
176
176
|
def main():
|
|
177
177
|
args = parse_args()
|
|
178
178
|
config = Config(
|
|
179
|
-
llm=
|
|
180
|
-
generation=
|
|
179
|
+
llm=llm_factory.Config(
|
|
180
|
+
generation=llm_factory.GenerationConfig(
|
|
181
181
|
provider=args.provider,
|
|
182
182
|
model=args.llm,
|
|
183
183
|
temperature=args.temperature,
|
|
@@ -185,8 +185,8 @@ def main():
|
|
|
185
185
|
)
|
|
186
186
|
)
|
|
187
187
|
)
|
|
188
|
-
evaluate_and_write(
|
|
188
|
+
asyncio.run(evaluate_and_write(
|
|
189
189
|
args.in_file,
|
|
190
190
|
args.out_file,
|
|
191
191
|
config,
|
|
192
|
-
)
|
|
192
|
+
))
|
|
@@ -3,6 +3,7 @@ from typing import Literal
|
|
|
3
3
|
|
|
4
4
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
5
5
|
|
|
6
|
+
from graphrag_eval.llm_factory import create_llm
|
|
6
7
|
|
|
7
8
|
RESERVED_KEYS = {
|
|
8
9
|
"template_id",
|
|
@@ -31,7 +32,6 @@ RESERVED_KEYS = {
|
|
|
31
32
|
"elapsed_sec",
|
|
32
33
|
}
|
|
33
34
|
|
|
34
|
-
|
|
35
35
|
Inputs = Literal[
|
|
36
36
|
"question",
|
|
37
37
|
"reference_answer",
|
|
@@ -84,8 +84,8 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
|
84
84
|
|
|
85
85
|
output_variables specifies the order of the outputs.
|
|
86
86
|
"""
|
|
87
|
-
output_instructions = "Output the following values separated by tabs:"\
|
|
88
|
-
|
|
87
|
+
output_instructions = "Output the following values separated by tabs:" \
|
|
88
|
+
+ "".join(f"\n- {k}: {config.outputs[k]}" for k in output_variables)
|
|
89
89
|
inputs_template = "\n\n".join(
|
|
90
90
|
create_input_template(k) for k in config.inputs
|
|
91
91
|
)
|
|
@@ -98,9 +98,9 @@ def create_prompt_template(config: Config, output_variables: list[str]) -> str:
|
|
|
98
98
|
|
|
99
99
|
class CustomEvaluator:
|
|
100
100
|
def __init__(
|
|
101
|
-
self,
|
|
101
|
+
self,
|
|
102
102
|
config: Config,
|
|
103
|
-
|
|
103
|
+
eval_config: "evaluation.Config",
|
|
104
104
|
):
|
|
105
105
|
self.name = config.name
|
|
106
106
|
self.input_variables = config.inputs
|
|
@@ -111,11 +111,11 @@ class CustomEvaluator:
|
|
|
111
111
|
config,
|
|
112
112
|
self.output_variables
|
|
113
113
|
)
|
|
114
|
-
self.llm =
|
|
114
|
+
self.llm = create_llm(eval_config)
|
|
115
115
|
|
|
116
|
-
def
|
|
116
|
+
async def _agenerate(self, prompt: str) -> str:
|
|
117
117
|
"""Wrapper method for easier testing"""
|
|
118
|
-
return self.llm.
|
|
118
|
+
return (await self.llm.agenerate(prompt, None)).choices[0].message.content
|
|
119
119
|
|
|
120
120
|
def format_steps(self, steps: list) -> str:
|
|
121
121
|
steps_formatted = []
|
|
@@ -134,9 +134,9 @@ class CustomEvaluator:
|
|
|
134
134
|
step_out[k] = val
|
|
135
135
|
else:
|
|
136
136
|
step_out[k] = val
|
|
137
|
-
steps_formatted.append(step_out)
|
|
137
|
+
steps_formatted.append(step_out)
|
|
138
138
|
return json.dumps(steps_formatted, indent=2)
|
|
139
|
-
|
|
139
|
+
|
|
140
140
|
def error(self, msg: str) -> dict:
|
|
141
141
|
result = {k: None for k in self.output_variables}
|
|
142
142
|
result[self.name + '_error'] = msg
|
|
@@ -157,7 +157,7 @@ class CustomEvaluator:
|
|
|
157
157
|
return result
|
|
158
158
|
return self.error(f"Expected {n_exp} tab-separated values, got: {response}")
|
|
159
159
|
|
|
160
|
-
def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
|
|
160
|
+
async def evaluate(self, reference: dict, actual: dict) -> dict[str, str | None]:
|
|
161
161
|
inputs = {}
|
|
162
162
|
if "question" in self.input_variables:
|
|
163
163
|
if "question_text" not in reference:
|
|
@@ -176,7 +176,7 @@ class CustomEvaluator:
|
|
|
176
176
|
return self.error("Reference missing key 'reference_steps'")
|
|
177
177
|
try:
|
|
178
178
|
formatted_steps_lists = [
|
|
179
|
-
self.format_steps(group)
|
|
179
|
+
self.format_steps(group)
|
|
180
180
|
for group in reference["reference_steps"]
|
|
181
181
|
]
|
|
182
182
|
except json.JSONDecodeError:
|
|
@@ -191,14 +191,14 @@ class CustomEvaluator:
|
|
|
191
191
|
return self.error("Malformed actual step JSON")
|
|
192
192
|
inputs["actual_steps"] = formatted_steps_lists
|
|
193
193
|
prompt = self.prompt_template.format(**inputs)
|
|
194
|
-
response = self.
|
|
194
|
+
response = await self._agenerate(prompt)
|
|
195
195
|
return self.parse_outputs(response)
|
|
196
196
|
|
|
197
197
|
|
|
198
198
|
def create_evaluators(config: "evaluation.Config") -> list[CustomEvaluator]:
|
|
199
199
|
if config.custom_evaluations and config.llm:
|
|
200
200
|
return [
|
|
201
|
-
CustomEvaluator(
|
|
202
|
-
for
|
|
201
|
+
CustomEvaluator(custom_evaluation_config, config)
|
|
202
|
+
for custom_evaluation_config in config.custom_evaluations
|
|
203
203
|
]
|
|
204
204
|
return []
|
|
@@ -4,7 +4,7 @@ import yaml
|
|
|
4
4
|
from pydantic import BaseModel, Field, model_validator
|
|
5
5
|
|
|
6
6
|
from . import custom_evaluation
|
|
7
|
-
from .llm import Config as LLMConfig, create_llm, create_embedder
|
|
7
|
+
from .llm_factory import Config as LLMConfig, create_llm, create_embedder
|
|
8
8
|
from .steps.evaluation import evaluate_steps
|
|
9
9
|
|
|
10
10
|
|
|
@@ -19,7 +19,7 @@ class Config(BaseModel):
|
|
|
19
19
|
msg = "llm config is required if custom_evaluations are provided"
|
|
20
20
|
raise ValueError(msg)
|
|
21
21
|
return self
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
@classmethod
|
|
24
24
|
def parse(cls, config_file_path: str | Path | None) -> "Config":
|
|
25
25
|
if config_file_path:
|
|
@@ -78,7 +78,7 @@ async def run_evaluation(
|
|
|
78
78
|
llm=ragas_llm
|
|
79
79
|
)
|
|
80
80
|
eval_result.update(
|
|
81
|
-
answer_correctness_evaluator.get_correctness_dict(
|
|
81
|
+
await answer_correctness_evaluator.get_correctness_dict(
|
|
82
82
|
question,
|
|
83
83
|
actual_result,
|
|
84
84
|
)
|
|
@@ -90,8 +90,8 @@ async def run_evaluation(
|
|
|
90
90
|
ragas_llm,
|
|
91
91
|
)
|
|
92
92
|
)
|
|
93
|
-
for
|
|
94
|
-
custom_metrics =
|
|
93
|
+
for custom_evaluator in custom_evaluators:
|
|
94
|
+
custom_metrics = await custom_evaluator.evaluate(question, actual_result)
|
|
95
95
|
eval_result.update(**custom_metrics)
|
|
96
96
|
for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
|
|
97
97
|
if key in actual_result:
|
|
@@ -37,6 +37,7 @@ def create_llm(config: "evaluation.Config") -> Optional["InstructorBaseRagasLLM"
|
|
|
37
37
|
)
|
|
38
38
|
ragas_llm.is_async = True
|
|
39
39
|
return ragas_llm
|
|
40
|
+
return None
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding"]:
|
|
@@ -53,3 +54,4 @@ def create_embedder(config: "evaluation.Config") -> Optional["BaseRagasEmbedding
|
|
|
53
54
|
**params,
|
|
54
55
|
)
|
|
55
56
|
return ragas_embedder
|
|
57
|
+
return None
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def do_iri_discovery_steps_equal(
|
|
6
|
+
reference_step: dict[str, Any],
|
|
7
|
+
actual_step: dict[str, Any],
|
|
8
|
+
) -> bool:
|
|
9
|
+
if actual_step["name"] == "autocomplete_search":
|
|
10
|
+
reference_iri = reference_step["output"]
|
|
11
|
+
actual_output = json.loads(actual_step["output"])
|
|
12
|
+
|
|
13
|
+
for binding in actual_output["results"]["bindings"]:
|
|
14
|
+
for _, type_value in binding.items():
|
|
15
|
+
if type_value["type"] == "uri" and type_value["value"] == reference_iri:
|
|
16
|
+
return True
|
|
17
|
+
elif actual_step["name"] == "sparql_query":
|
|
18
|
+
reference_iri = reference_step["output"]
|
|
19
|
+
if reference_iri in actual_step["output"]:
|
|
20
|
+
return True
|
|
21
|
+
return False
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "6.0.0"
|
|
3
|
+
version = "6.2.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
@@ -24,7 +24,7 @@ llm = ["ragas", "litellm", "pyyaml"]
|
|
|
24
24
|
|
|
25
25
|
[tool.poetry.group.llm.dependencies]
|
|
26
26
|
ragas = "0.4.3"
|
|
27
|
-
litellm = "1.
|
|
27
|
+
litellm = "1.85.1"
|
|
28
28
|
pyyaml = "6.0.3"
|
|
29
29
|
|
|
30
30
|
[tool.poetry.group.llm]
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def do_iri_discovery_steps_equal(
|
|
6
|
-
reference_step: dict[str, Any],
|
|
7
|
-
actual_step: dict[str, Any],
|
|
8
|
-
) -> bool:
|
|
9
|
-
if actual_step["name"] != "autocomplete_search":
|
|
10
|
-
return False
|
|
11
|
-
|
|
12
|
-
reference_iri = reference_step["output"]
|
|
13
|
-
actual_output = json.loads(actual_step["output"])
|
|
14
|
-
|
|
15
|
-
for binding in actual_output["results"]["bindings"]:
|
|
16
|
-
for _, type_value in binding.items():
|
|
17
|
-
if type_value["type"] == "uri" and type_value["value"] == reference_iri:
|
|
18
|
-
return True
|
|
19
|
-
|
|
20
|
-
return False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|