graphrag-eval 5.1.1__tar.gz → 5.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/PKG-INFO +2 -1
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/README.md +1 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/aggregation.py +17 -17
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/evaluation.py +2 -2
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/sparql.py +10 -7
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/pyproject.toml +1 -1
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/LICENSE +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/answer_correctness.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/answer_relevance.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/evaluation.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/prompts/template.md +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/__init__.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/retrieval_answer.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/retrieval_context_ids.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/steps/retrieval_context_texts.py +0 -0
- {graphrag_eval-5.1.1 → graphrag_eval-5.2.0}/graphrag_eval/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: graphrag-eval
|
|
3
|
-
Version: 5.1.1
|
|
3
|
+
Version: 5.2.0
|
|
4
4
|
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Philip Ganchev
|
|
@@ -107,6 +107,7 @@ Each step includes:
|
|
|
107
107
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
108
108
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
109
109
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
110
|
+
- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
|
|
110
111
|
|
|
111
112
|
#### Reference Data
|
|
112
113
|
|
|
@@ -87,6 +87,7 @@ Each step includes:
|
|
|
87
87
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
88
88
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
89
89
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
90
|
+
- `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate binding values in the expected or in the actual results should be ignored for the comparison.
|
|
90
91
|
|
|
91
92
|
#### Reference Data
|
|
92
93
|
|
|
@@ -4,7 +4,6 @@ from collections.abc import Sequence
|
|
|
4
4
|
from statistics import mean, median
|
|
5
5
|
from typing import Any, Collection, Iterable
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
METRICS = [
|
|
9
8
|
"answer_recall",
|
|
10
9
|
"answer_precision",
|
|
@@ -135,7 +134,7 @@ def compute_micro_stats(
|
|
|
135
134
|
number_of_samples_per_template_by_status,
|
|
136
135
|
stats_per_template,
|
|
137
136
|
step_metrics_per_template
|
|
138
|
-
):
|
|
137
|
+
) -> dict:
|
|
139
138
|
values = number_of_samples_per_template_by_status.values()
|
|
140
139
|
micro_summary = defaultdict(dict, {
|
|
141
140
|
"number_of_error_samples": sum(v["error"] for v in values),
|
|
@@ -158,7 +157,7 @@ def compute_micro_stats(
|
|
|
158
157
|
micro_step_metrics[metric].extend(values)
|
|
159
158
|
for metric, values in micro_step_metrics.items():
|
|
160
159
|
micro_summary[metric] = stats_for_series(values)
|
|
161
|
-
return micro_summary
|
|
160
|
+
return dict(micro_summary)
|
|
162
161
|
|
|
163
162
|
|
|
164
163
|
def compute_macro_stats(
|
|
@@ -182,7 +181,7 @@ def compute_macro_stats(
|
|
|
182
181
|
macro_step_metrics[metric].append(stats["mean"])
|
|
183
182
|
for metric, values in macro_step_metrics.items():
|
|
184
183
|
macro_summary[metric]["mean"] = mean(values or [0])
|
|
185
|
-
return macro_summary
|
|
184
|
+
return dict(macro_summary)
|
|
186
185
|
|
|
187
186
|
|
|
188
187
|
def compute_aggregates(samples: list[dict]) -> dict:
|
|
@@ -205,18 +204,19 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
205
204
|
update_steps_summary(sample, steps_summary_per_template[template_id])
|
|
206
205
|
update_step_metrics(sample, step_metrics_per_template[template_id])
|
|
207
206
|
|
|
208
|
-
summary = {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
207
|
+
summary = {
|
|
208
|
+
"per_template": compute_per_template_stats(
|
|
209
|
+
templates_ids,
|
|
210
|
+
number_of_samples_per_template_by_status,
|
|
211
|
+
stats_per_template,
|
|
212
|
+
steps_summary_per_template,
|
|
213
|
+
step_metrics_per_template,
|
|
214
|
+
),
|
|
215
|
+
"micro": compute_micro_stats(
|
|
216
|
+
number_of_samples_per_template_by_status,
|
|
217
|
+
stats_per_template,
|
|
218
|
+
step_metrics_per_template
|
|
219
|
+
)
|
|
220
|
+
}
|
|
221
221
|
summary["macro"] = compute_macro_stats(summary["per_template"])
|
|
222
222
|
return summary
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from collections import defaultdict
|
|
3
|
-
from typing import Any
|
|
4
3
|
from collections.abc import Sequence
|
|
4
|
+
from typing import Any
|
|
5
5
|
|
|
6
6
|
from .retrieval_context_ids import recall_at_k
|
|
7
7
|
from .sparql import compare_sparql_results
|
|
8
8
|
|
|
9
|
-
|
|
10
9
|
Match = tuple[int, int, int, float]
|
|
11
10
|
Step = dict[str, Any]
|
|
12
11
|
StepsGroup = Sequence[Step] # We will index into a group
|
|
@@ -23,6 +22,7 @@ def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
|
|
|
23
22
|
json.loads(actual_output),
|
|
24
23
|
reference_step["required_columns"],
|
|
25
24
|
reference_step.get("ordered", False),
|
|
25
|
+
reference_step.get("ignore_duplicates", True),
|
|
26
26
|
)
|
|
27
27
|
if reference_step.get("output_media_type") == "application/json":
|
|
28
28
|
return float(json.loads(reference_output) == json.loads(actual_output))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from collections import Counter
|
|
2
|
-
from typing import Union
|
|
3
1
|
import itertools
|
|
4
2
|
import math
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Union
|
|
5
5
|
|
|
6
6
|
XSD_NUMERIC_TYPES = {
|
|
7
7
|
"http://www.w3.org/2001/XMLSchema#integer",
|
|
@@ -35,7 +35,7 @@ def truncate(number: float, decimals: int = 0) -> float:
|
|
|
35
35
|
elif decimals == 0:
|
|
36
36
|
return math.trunc(number)
|
|
37
37
|
|
|
38
|
-
factor = 10.0**decimals
|
|
38
|
+
factor = 10.0 ** decimals
|
|
39
39
|
return math.trunc(number * factor) / factor
|
|
40
40
|
|
|
41
41
|
|
|
@@ -137,8 +137,8 @@ def compare_values(
|
|
|
137
137
|
actual_vars: Union[list[str], tuple[str, ...]],
|
|
138
138
|
actual_var_to_values: dict[str, list],
|
|
139
139
|
results_are_ordered: bool,
|
|
140
|
+
ignore_duplicates: bool,
|
|
140
141
|
) -> bool:
|
|
141
|
-
|
|
142
142
|
if len(reference_vars) < len(actual_vars):
|
|
143
143
|
for combination in itertools.combinations(actual_vars, len(reference_vars)):
|
|
144
144
|
if compare_values(
|
|
@@ -147,6 +147,7 @@ def compare_values(
|
|
|
147
147
|
combination,
|
|
148
148
|
actual_var_to_values,
|
|
149
149
|
results_are_ordered,
|
|
150
|
+
ignore_duplicates,
|
|
150
151
|
):
|
|
151
152
|
return True
|
|
152
153
|
return False
|
|
@@ -154,9 +155,9 @@ def compare_values(
|
|
|
154
155
|
table = convert_table_dict2lines(reference_vars, reference_var_to_values)
|
|
155
156
|
for permutation in itertools.permutations(actual_vars):
|
|
156
157
|
actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
|
|
157
|
-
if (results_are_ordered and table == actual_table) or
|
|
158
|
-
not results_are_ordered and
|
|
159
|
-
|
|
158
|
+
if (results_are_ordered and table == actual_table) or \
|
|
159
|
+
((not results_are_ordered) and ignore_duplicates and set(table) == set(actual_table)) or \
|
|
160
|
+
((not results_are_ordered) and (not ignore_duplicates) and Counter(table) == Counter(actual_table)):
|
|
160
161
|
return True
|
|
161
162
|
|
|
162
163
|
return False
|
|
@@ -167,6 +168,7 @@ def compare_sparql_results(
|
|
|
167
168
|
actual_sparql_result: dict,
|
|
168
169
|
required_vars: list[str],
|
|
169
170
|
results_are_ordered: bool = False,
|
|
171
|
+
ignore_duplicates: bool = True,
|
|
170
172
|
) -> float:
|
|
171
173
|
# DESCRIBE results
|
|
172
174
|
if isinstance(actual_sparql_result, str):
|
|
@@ -208,5 +210,6 @@ def compare_sparql_results(
|
|
|
208
210
|
actual_vars,
|
|
209
211
|
actual_var_to_values,
|
|
210
212
|
results_are_ordered,
|
|
213
|
+
ignore_duplicates,
|
|
211
214
|
)
|
|
212
215
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "5.1.1"
|
|
3
|
+
version = "5.2.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|