graphrag-eval 5.2.0__tar.gz → 5.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/PKG-INFO +11 -6
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/README.md +10 -5
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/aggregation.py +17 -5
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/evaluation.py +22 -23
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/pyproject.toml +1 -1
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/LICENSE +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/answer_correctness.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/evaluation.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/steps/sparql.py +0 -0
- {graphrag_eval-5.2.0 → graphrag_eval-5.3.1}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 5.2.0
|
|
3
|
+
Version: 5.3.1
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -19,7 +19,7 @@ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
|
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
|
|
21
21
|
<p align="center">
|
|
22
|
-
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
22
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
|
|
23
23
|
</p>
|
|
24
24
|
|
|
25
25
|
# QA Evaluation
|
|
@@ -28,7 +28,7 @@ This is a Python module for assessing the quality of question-answering systems
|
|
|
28
28
|
|
|
29
29
|
## License
|
|
30
30
|
|
|
31
|
-
Apache-2.0 License. See [LICENSE](LICENSE) file for details.
|
|
31
|
+
Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
32
32
|
|
|
33
33
|
## Installation
|
|
34
34
|
|
|
@@ -592,7 +592,7 @@ Aggregates are:
|
|
|
592
592
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
593
593
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
594
594
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
595
|
-
- `sum`, `mean`, `median`, `min` and `max` statistics over all
|
|
595
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
|
|
596
596
|
- `input_tokens`
|
|
597
597
|
- `output_tokens`
|
|
598
598
|
- `total_tokens`
|
|
@@ -609,14 +609,19 @@ Aggregates are:
|
|
|
609
609
|
- `retrieval_context_precision`
|
|
610
610
|
- `retrieval_context_f1`
|
|
611
611
|
- `steps`: includes:
|
|
612
|
-
- `
|
|
612
|
+
- `total`: for each step type how many times it was executed
|
|
613
613
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
614
614
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
615
615
|
- `errors`: how many times the step was executed and resulted in error
|
|
616
616
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
617
617
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
618
618
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
619
|
-
- `
|
|
619
|
+
- `steps`: includes:
|
|
620
|
+
- `total`: for each step type how many times it was executed
|
|
621
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
622
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
623
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
624
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
|
|
620
625
|
- `input_tokens`
|
|
621
626
|
- `output_tokens`
|
|
622
627
|
- `total_tokens`
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
2
|
+
<img alt="Graphwise Logo" src="https://github.com/Ontotext-AD/graphrag-eval/blob/main/.github/Graphwise_Logo.jpg">
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
# QA Evaluation
|
|
@@ -8,7 +8,7 @@ This is a Python module for assessing the quality of question-answering systems
|
|
|
8
8
|
|
|
9
9
|
## License
|
|
10
10
|
|
|
11
|
-
Apache-2.0 License. See [LICENSE](LICENSE) file for details.
|
|
11
|
+
Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
|
|
12
12
|
|
|
13
13
|
## Installation
|
|
14
14
|
|
|
@@ -572,7 +572,7 @@ Aggregates are:
|
|
|
572
572
|
- `per_template`: a dictionary mapping a template identifier to the following statistics:
|
|
573
573
|
- `number_of_error_samples`: number of questions for this template, which resulted in error response
|
|
574
574
|
- `number_of_success_samples`: number of questions for this template, which resulted in successful response
|
|
575
|
-
- `sum`, `mean`, `median`, `min` and `max` statistics over all
|
|
575
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
|
|
576
576
|
- `input_tokens`
|
|
577
577
|
- `output_tokens`
|
|
578
578
|
- `total_tokens`
|
|
@@ -589,14 +589,19 @@ Aggregates are:
|
|
|
589
589
|
- `retrieval_context_precision`
|
|
590
590
|
- `retrieval_context_f1`
|
|
591
591
|
- `steps`: includes:
|
|
592
|
-
- `
|
|
592
|
+
- `total`: for each step type how many times it was executed
|
|
593
593
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
594
594
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
595
595
|
- `errors`: how many times the step was executed and resulted in error
|
|
596
596
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
597
597
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
598
598
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
599
|
-
- `
|
|
599
|
+
- `steps`: includes:
|
|
600
|
+
- `total`: for each step type how many times it was executed
|
|
601
|
+
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
602
|
+
- `empty_results`: how many times the step was executed and returned empty results
|
|
603
|
+
- `errors`: how many times the step was executed and resulted in error
|
|
604
|
+
- `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
|
|
600
605
|
- `input_tokens`
|
|
601
606
|
- `output_tokens`
|
|
602
607
|
- `total_tokens`
|
|
@@ -131,9 +131,10 @@ def compute_per_template_stats(
|
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
def compute_micro_stats(
|
|
134
|
-
number_of_samples_per_template_by_status,
|
|
135
|
-
stats_per_template,
|
|
136
|
-
|
|
134
|
+
number_of_samples_per_template_by_status: dict[str, dict[str, int]],
|
|
135
|
+
stats_per_template: dict[str, dict[str, Sequence[int]]],
|
|
136
|
+
steps_summary_per_template: dict[str, dict[str, dict[str, int]]],
|
|
137
|
+
step_metrics_per_template: dict[str, dict[str, Sequence[int]]],
|
|
137
138
|
) -> dict:
|
|
138
139
|
values = number_of_samples_per_template_by_status.values()
|
|
139
140
|
micro_summary = defaultdict(dict, {
|
|
@@ -157,6 +158,16 @@ def compute_micro_stats(
|
|
|
157
158
|
micro_step_metrics[metric].extend(values)
|
|
158
159
|
for metric, values in micro_step_metrics.items():
|
|
159
160
|
micro_summary[metric] = stats_for_series(values)
|
|
161
|
+
|
|
162
|
+
steps_summary = defaultdict(lambda: defaultdict(int))
|
|
163
|
+
for template_steps_summary in steps_summary_per_template.values():
|
|
164
|
+
for summary_name, steps_stats in template_steps_summary.items():
|
|
165
|
+
for step_id, count in steps_stats.items():
|
|
166
|
+
steps_summary[summary_name][step_id] += count
|
|
167
|
+
steps_summary = {k: dict(v) for k, v in steps_summary.items()}
|
|
168
|
+
if len(steps_summary) > 0:
|
|
169
|
+
micro_summary["steps"] = steps_summary
|
|
170
|
+
|
|
160
171
|
return dict(micro_summary)
|
|
161
172
|
|
|
162
173
|
|
|
@@ -198,8 +209,8 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
198
209
|
|
|
199
210
|
if "error" in sample:
|
|
200
211
|
number_of_samples_per_template_by_status[template_id]["error"] += 1
|
|
201
|
-
|
|
202
|
-
|
|
212
|
+
else:
|
|
213
|
+
number_of_samples_per_template_by_status[template_id]["success"] += 1
|
|
203
214
|
update_stats(sample, stats_per_template[template_id])
|
|
204
215
|
update_steps_summary(sample, steps_summary_per_template[template_id])
|
|
205
216
|
update_step_metrics(sample, step_metrics_per_template[template_id])
|
|
@@ -215,6 +226,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
215
226
|
"micro": compute_micro_stats(
|
|
216
227
|
number_of_samples_per_template_by_status,
|
|
217
228
|
stats_per_template,
|
|
229
|
+
steps_summary_per_template,
|
|
218
230
|
step_metrics_per_template
|
|
219
231
|
)
|
|
220
232
|
}
|
|
@@ -6,7 +6,7 @@ def run_evaluation(
|
|
|
6
6
|
responses_dict: dict,
|
|
7
7
|
) -> list[dict]:
|
|
8
8
|
# Output metrics are not nested, for simpler aggregation
|
|
9
|
-
|
|
9
|
+
answer_correctness_evaluator = None
|
|
10
10
|
evaluation_results = []
|
|
11
11
|
for template in qa_dataset:
|
|
12
12
|
template_id = template["template_id"]
|
|
@@ -26,9 +26,9 @@ def run_evaluation(
|
|
|
26
26
|
"status": "error",
|
|
27
27
|
"error": actual_result["error"],
|
|
28
28
|
})
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
else:
|
|
30
|
+
eval_result["status"] = "success"
|
|
31
|
+
|
|
32
32
|
if "actual_answer" in actual_result:
|
|
33
33
|
eval_result["actual_answer"] = actual_result["actual_answer"]
|
|
34
34
|
from graphrag_eval import answer_relevance
|
|
@@ -38,25 +38,24 @@ def run_evaluation(
|
|
|
38
38
|
actual_result["actual_answer"],
|
|
39
39
|
)
|
|
40
40
|
)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
41
|
+
|
|
42
|
+
if "reference_answer" in question:
|
|
43
|
+
from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
|
|
44
|
+
if not answer_correctness_evaluator:
|
|
45
|
+
answer_correctness_evaluator = AnswerCorrectnessEvaluator()
|
|
46
|
+
eval_result.update(
|
|
47
|
+
answer_correctness_evaluator.get_correctness_dict(
|
|
48
|
+
question,
|
|
49
|
+
actual_result,
|
|
50
|
+
)
|
|
49
51
|
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
"total_tokens": actual_result["total_tokens"],
|
|
59
|
-
"elapsed_sec": actual_result["elapsed_sec"],
|
|
60
|
-
})
|
|
52
|
+
|
|
53
|
+
eval_result.update(
|
|
54
|
+
get_steps_evaluation_result_dict(question, actual_result)
|
|
55
|
+
)
|
|
56
|
+
for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
|
|
57
|
+
if key in actual_result:
|
|
58
|
+
eval_result[key] = actual_result[key]
|
|
59
|
+
|
|
61
60
|
evaluation_results.append(eval_result)
|
|
62
61
|
return evaluation_results
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "5.2.0"
|
|
3
|
+
version = "5.3.1"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|