graphrag-eval 5.1.2__tar.gz → 5.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/PKG-INFO +12 -6
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/README.md +11 -5
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/aggregation.py +17 -5
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/evaluation.py +22 -23
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/evaluation.py +2 -2
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/sparql.py +10 -7
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/pyproject.toml +1 -1
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/LICENSE +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/answer_correctness.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-5.1.2 → graphrag_eval-5.3.0}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 5.1.2
|
|
3
|
+
Version: 5.3.0
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -19,7 +19,7 @@ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
|
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
|
|
21
21
|
<p align="center">
|
|
22
|
-
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
22
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
|
|
23
23
|
</p>
|
|
24
24
|
|
|
25
25
|
# QA Evaluation
|
|
@@ -28,7 +28,7 @@ This is a Python module for assessing the quality of question-answering systems
|
|
|
28
28
|
|
|
29
29
|
## License
|
|
30
30
|
|
|
31
|
-
Apache-2.0 License. See [LICENSE](LICENSE) file for details.
|
|
31
|
+
Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
32
32
|
|
|
33
33
|
## Installation
|
|
34
34
|
|
|
@@ -107,6 +107,7 @@ Each step includes:
|
|
|
107
107
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
108
108
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
109
109
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
110
|
+
- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
|
|
110
111
|
|
|
111
112
|
#### Reference Data
|
|
112
113
|
|
|
@@ -591,7 +592,7 @@ Aggregates are:
|
|
|
591
592
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
592
593
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
593
594
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
594
|
-
- `sum`, `mean`, `median`, `min` and `max` statistics over all
|
|
595
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
|
|
595
596
|
- `input_tokens`
|
|
596
597
|
- `output_tokens`
|
|
597
598
|
- `total_tokens`
|
|
@@ -608,14 +609,19 @@ Aggregates are:
|
|
|
608
609
|
- `retrieval_context_precision`
|
|
609
610
|
- `retrieval_context_f1`
|
|
610
611
|
- `steps`: includes:
|
|
611
|
-
- `
|
|
612
|
+
- `total`: for each step type how many times it was executed
|
|
612
613
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
613
614
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
614
615
|
- `errors`: how many times the step was executed and resulted in error
|
|
615
616
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
616
617
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
617
618
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
618
|
-
- `
|
|
619
|
+
- `steps`: includes:
|
|
620
|
+
- `total`: for each step type how many times it was executed
|
|
621
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
622
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
623
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
624
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
|
|
619
625
|
- `input_tokens`
|
|
620
626
|
- `output_tokens`
|
|
621
627
|
- `total_tokens`
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
2
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
# QA Evaluation
|
|
@@ -8,7 +8,7 @@ This is a Python module for assessing the quality of question-answering systems
|
|
|
8
8
|
|
|
9
9
|
## License
|
|
10
10
|
|
|
11
|
-
Apache-2.0 License. See [LICENSE](LICENSE) file for details.
|
|
11
|
+
Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
12
12
|
|
|
13
13
|
## Installation
|
|
14
14
|
|
|
@@ -87,6 +87,7 @@ Each step includes:
|
|
|
87
87
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
88
88
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
89
89
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
90
|
+
- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
|
|
90
91
|
|
|
91
92
|
#### Reference Data
|
|
92
93
|
|
|
@@ -571,7 +572,7 @@ Aggregates are:
|
|
|
571
572
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
572
573
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
573
574
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
574
|
-
- `sum`, `mean`, `median`, `min` and `max` statistics over all
|
|
575
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
|
|
575
576
|
- `input_tokens`
|
|
576
577
|
- `output_tokens`
|
|
577
578
|
- `total_tokens`
|
|
@@ -588,14 +589,19 @@ Aggregates are:
|
|
|
588
589
|
- `retrieval_context_precision`
|
|
589
590
|
- `retrieval_context_f1`
|
|
590
591
|
- `steps`: includes:
|
|
591
|
-
- `
|
|
592
|
+
- `total`: for each step type how many times it was executed
|
|
592
593
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
593
594
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
594
595
|
- `errors`: how many times the step was executed and resulted in error
|
|
595
596
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
596
597
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
597
598
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
598
|
-
- `
|
|
599
|
+
- `steps`: includes:
|
|
600
|
+
- `total`: for each step type how many times it was executed
|
|
601
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
602
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
603
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
604
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
|
|
599
605
|
- `input_tokens`
|
|
600
606
|
- `output_tokens`
|
|
601
607
|
- `total_tokens`
|
|
@@ -131,9 +131,10 @@ def compute_per_template_stats(
|
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
def compute_micro_stats(
|
|
134
|
-
number_of_samples_per_template_by_status,
|
|
135
|
-
stats_per_template,
|
|
136
|
-
|
|
134
|
+
number_of_samples_per_template_by_status: dict[str, dict[str, int]],
|
|
135
|
+
stats_per_template: dict[str, dict[str, Sequence[int]]],
|
|
136
|
+
steps_summary_per_template: dict[str, dict[str, dict[str, int]]],
|
|
137
|
+
step_metrics_per_template: dict[str, dict[str, Sequence[int]]],
|
|
137
138
|
) -> dict:
|
|
138
139
|
values = number_of_samples_per_template_by_status.values()
|
|
139
140
|
micro_summary = defaultdict(dict, {
|
|
@@ -157,6 +158,16 @@ def compute_micro_stats(
|
|
|
157
158
|
micro_step_metrics[metric].extend(values)
|
|
158
159
|
for metric, values in micro_step_metrics.items():
|
|
159
160
|
micro_summary[metric] = stats_for_series(values)
|
|
161
|
+
|
|
162
|
+
steps_summary = defaultdict(lambda: defaultdict(int))
|
|
163
|
+
for template_steps_summary in steps_summary_per_template.values():
|
|
164
|
+
for summary_name, steps_stats in template_steps_summary.items():
|
|
165
|
+
for step_id, count in steps_stats.items():
|
|
166
|
+
steps_summary[summary_name][step_id] += count
|
|
167
|
+
steps_summary = {k: dict(v) for k, v in steps_summary.items()}
|
|
168
|
+
if len(steps_summary) > 0:
|
|
169
|
+
micro_summary["steps"] = steps_summary
|
|
170
|
+
|
|
160
171
|
return dict(micro_summary)
|
|
161
172
|
|
|
162
173
|
|
|
@@ -198,8 +209,8 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
198
209
|
|
|
199
210
|
if "error" in sample:
|
|
200
211
|
number_of_samples_per_template_by_status[template_id]["error"] += 1
|
|
201
|
-
|
|
202
|
-
|
|
212
|
+
else:
|
|
213
|
+
number_of_samples_per_template_by_status[template_id]["success"] += 1
|
|
203
214
|
update_stats(sample, stats_per_template[template_id])
|
|
204
215
|
update_steps_summary(sample, steps_summary_per_template[template_id])
|
|
205
216
|
update_step_metrics(sample, step_metrics_per_template[template_id])
|
|
@@ -215,6 +226,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
215
226
|
"micro": compute_micro_stats(
|
|
216
227
|
number_of_samples_per_template_by_status,
|
|
217
228
|
stats_per_template,
|
|
229
|
+
steps_summary_per_template,
|
|
218
230
|
step_metrics_per_template
|
|
219
231
|
)
|
|
220
232
|
}
|
|
@@ -6,7 +6,7 @@ def run_evaluation(
|
|
|
6
6
|
responses_dict: dict,
|
|
7
7
|
) -> list[dict]:
|
|
8
8
|
# Output metrics are not nested, for simpler aggregation
|
|
9
|
-
|
|
9
|
+
answer_correctness_evaluator = None
|
|
10
10
|
evaluation_results = []
|
|
11
11
|
for template in qa_dataset:
|
|
12
12
|
template_id = template["template_id"]
|
|
@@ -26,9 +26,9 @@ def run_evaluation(
|
|
|
26
26
|
"status": "error",
|
|
27
27
|
"error": actual_result["error"],
|
|
28
28
|
})
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
else:
|
|
30
|
+
eval_result["status"] = "success"
|
|
31
|
+
|
|
32
32
|
if "actual_answer" in actual_result:
|
|
33
33
|
eval_result["actual_answer"] = actual_result["actual_answer"]
|
|
34
34
|
from graphrag_eval import answer_relevance
|
|
@@ -38,25 +38,24 @@ def run_evaluation(
|
|
|
38
38
|
actual_result["actual_answer"],
|
|
39
39
|
)
|
|
40
40
|
)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
41
|
+
|
|
42
|
+
if "reference_answer" in question:
|
|
43
|
+
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
44
|
+
if not answer_correctness_evaluator:
|
|
45
|
+
answer_correctness_evaluator = AnswerCorrectnessEvaluator()
|
|
46
|
+
eval_result.update(
|
|
47
|
+
answer_correctness_evaluator.get_correctness_dict(
|
|
48
|
+
question,
|
|
49
|
+
actual_result,
|
|
50
|
+
)
|
|
49
51
|
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
"total_tokens": actual_result["total_tokens"],
|
|
59
|
-
"elapsed_sec": actual_result["elapsed_sec"],
|
|
60
|
-
})
|
|
52
|
+
|
|
53
|
+
eval_result.update(
|
|
54
|
+
get_steps_evaluation_result_dict(question, actual_result)
|
|
55
|
+
)
|
|
56
|
+
for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
|
|
57
|
+
if key in actual_result:
|
|
58
|
+
eval_result[key] = actual_result[key]
|
|
59
|
+
|
|
61
60
|
evaluation_results.append(eval_result)
|
|
62
61
|
return evaluation_results
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from collections import defaultdict
|
|
3
|
-
from typing import Any
|
|
4
3
|
from collections.abc import Sequence
|
|
4
|
+
from typing import Any
|
|
5
5
|
|
|
6
6
|
from .retrieval_context_ids import recall_at_k
|
|
7
7
|
from .sparql import compare_sparql_results
|
|
8
8
|
|
|
9
|
-
|
|
10
9
|
Match = tuple[int, int, int, float]
|
|
11
10
|
Step = dict[str, Any]
|
|
12
11
|
StepsGroup = Sequence[Step] # We will index into a group
|
|
@@ -23,6 +22,7 @@ def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
|
|
|
23
22
|
json.loads(actual_output),
|
|
24
23
|
reference_step["required_columns"],
|
|
25
24
|
reference_step.get("ordered", False),
|
|
25
|
+
reference_step.get("ignore_duplicates", True),
|
|
26
26
|
)
|
|
27
27
|
if reference_step.get("output_media_type") == "application/json":
|
|
28
28
|
return float(json.loads(reference_output) == json.loads(actual_output))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from collections import Counter
|
|
2
|
-
from typing import Union
|
|
3
1
|
import itertools
|
|
4
2
|
import math
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Union
|
|
5
5
|
|
|
6
6
|
XSD_NUMERIC_TYPES = {
|
|
7
7
|
"http://www.w3.org/2001/XMLSchema#integer",
|
|
@@ -35,7 +35,7 @@ def truncate(number: float, decimals: int = 0) -> float:
|
|
|
35
35
|
elif decimals == 0:
|
|
36
36
|
return math.trunc(number)
|
|
37
37
|
|
|
38
|
-
factor = 10.0**decimals
|
|
38
|
+
factor = 10.0 ** decimals
|
|
39
39
|
return math.trunc(number * factor) / factor
|
|
40
40
|
|
|
41
41
|
|
|
@@ -137,8 +137,8 @@ def compare_values(
|
|
|
137
137
|
actual_vars: Union[list[str], tuple[str, ...]],
|
|
138
138
|
actual_var_to_values: dict[str, list],
|
|
139
139
|
results_are_ordered: bool,
|
|
140
|
+
ignore_duplicates: bool,
|
|
140
141
|
) -> bool:
|
|
141
|
-
|
|
142
142
|
if len(reference_vars) < len(actual_vars):
|
|
143
143
|
for combination in itertools.combinations(actual_vars, len(reference_vars)):
|
|
144
144
|
if compare_values(
|
|
@@ -147,6 +147,7 @@ def compare_values(
|
|
|
147
147
|
combination,
|
|
148
148
|
actual_var_to_values,
|
|
149
149
|
results_are_ordered,
|
|
150
|
+
ignore_duplicates,
|
|
150
151
|
):
|
|
151
152
|
return True
|
|
152
153
|
return False
|
|
@@ -154,9 +155,9 @@ def compare_values(
|
|
|
154
155
|
table = convert_table_dict2lines(reference_vars, reference_var_to_values)
|
|
155
156
|
for permutation in itertools.permutations(actual_vars):
|
|
156
157
|
actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
|
|
157
|
-
if (results_are_ordered and table == actual_table) or
|
|
158
|
-
not results_are_ordered and
|
|
159
|
-
|
|
158
|
+
if (results_are_ordered and table == actual_table) or \
|
|
159
|
+
((not results_are_ordered) and ignore_duplicates and set(table) == set(actual_table)) or \
|
|
160
|
+
((not results_are_ordered) and (not ignore_duplicates) and Counter(table) == Counter(actual_table)):
|
|
160
161
|
return True
|
|
161
162
|
|
|
162
163
|
return False
|
|
@@ -167,6 +168,7 @@ def compare_sparql_results(
|
|
|
167
168
|
actual_sparql_result: dict,
|
|
168
169
|
required_vars: list[str],
|
|
169
170
|
results_are_ordered: bool = False,
|
|
171
|
+
ignore_duplicates: bool = True,
|
|
170
172
|
) -> float:
|
|
171
173
|
# DESCRIBE results
|
|
172
174
|
if isinstance(actual_sparql_result, str):
|
|
@@ -208,5 +210,6 @@ def compare_sparql_results(
|
|
|
208
210
|
actual_vars,
|
|
209
211
|
actual_var_to_values,
|
|
210
212
|
results_are_ordered,
|
|
213
|
+
ignore_duplicates,
|
|
211
214
|
)
|
|
212
215
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "5.1.2"
|
|
3
|
+
version = "5.3.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|