graphrag-eval 5.1.1__tar.gz → 5.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: graphrag-eval
3
- Version: 5.1.1
3
+ Version: 5.2.0
4
4
  Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
5
  License: Apache-2.0
6
6
  Author: Philip Ganchev
@@ -107,6 +107,7 @@ Each step includes:
107
107
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
108
108
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
109
109
  - `required_columns`: (optional; required only for SPARQL query results) List of the binding names that are required for SPARQL query results to match
110
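+ - `ignore_duplicates`: (optional, defaults to `true`) For unordered SPARQL query results, whether duplicate result rows in the reference or in the actual results are ignored in the comparison. `true` means that rows are matched as a set; `false` means that each row must occur the same number of times in both results. Has no effect when `ordered` is `true`.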
110
111
 
111
112
  #### Reference Data
112
113
 
@@ -87,6 +87,7 @@ Each step includes:
87
87
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
88
88
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
89
89
  - `required_columns`: (optional; required only for SPARQL query results) List of the binding names that are required for SPARQL query results to match
90
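+ - `ignore_duplicates`: (optional, defaults to `true`) For unordered SPARQL query results, whether duplicate result rows in the reference or in the actual results are ignored in the comparison. `true` means that rows are matched as a set; `false` means that each row must occur the same number of times in both results. Has no effect when `ordered` is `true`.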
90
91
 
91
92
  #### Reference Data
92
93
 
@@ -4,7 +4,6 @@ from collections.abc import Sequence
4
4
  from statistics import mean, median
5
5
  from typing import Any, Collection, Iterable
6
6
 
7
-
8
7
  METRICS = [
9
8
  "answer_recall",
10
9
  "answer_precision",
@@ -135,7 +134,7 @@ def compute_micro_stats(
135
134
  number_of_samples_per_template_by_status,
136
135
  stats_per_template,
137
136
  step_metrics_per_template
138
- ):
137
+ ) -> dict:
139
138
  values = number_of_samples_per_template_by_status.values()
140
139
  micro_summary = defaultdict(dict, {
141
140
  "number_of_error_samples": sum(v["error"] for v in values),
@@ -158,7 +157,7 @@ def compute_micro_stats(
158
157
  micro_step_metrics[metric].extend(values)
159
158
  for metric, values in micro_step_metrics.items():
160
159
  micro_summary[metric] = stats_for_series(values)
161
- return micro_summary
160
+ return dict(micro_summary)
162
161
 
163
162
 
164
163
  def compute_macro_stats(
@@ -182,7 +181,7 @@ def compute_macro_stats(
182
181
  macro_step_metrics[metric].append(stats["mean"])
183
182
  for metric, values in macro_step_metrics.items():
184
183
  macro_summary[metric]["mean"] = mean(values or [0])
185
- return macro_summary
184
+ return dict(macro_summary)
186
185
 
187
186
 
188
187
  def compute_aggregates(samples: list[dict]) -> dict:
@@ -205,18 +204,19 @@ def compute_aggregates(samples: list[dict]) -> dict:
205
204
  update_steps_summary(sample, steps_summary_per_template[template_id])
206
205
  update_step_metrics(sample, step_metrics_per_template[template_id])
207
206
 
208
- summary = {}
209
- summary["per_template"] = compute_per_template_stats(
210
- templates_ids,
211
- number_of_samples_per_template_by_status,
212
- stats_per_template,
213
- steps_summary_per_template,
214
- step_metrics_per_template,
215
- )
216
- summary["micro"] = compute_micro_stats(
217
- number_of_samples_per_template_by_status,
218
- stats_per_template,
219
- step_metrics_per_template
220
- )
207
+ summary = {
208
+ "per_template": compute_per_template_stats(
209
+ templates_ids,
210
+ number_of_samples_per_template_by_status,
211
+ stats_per_template,
212
+ steps_summary_per_template,
213
+ step_metrics_per_template,
214
+ ),
215
+ "micro": compute_micro_stats(
216
+ number_of_samples_per_template_by_status,
217
+ stats_per_template,
218
+ step_metrics_per_template
219
+ )
220
+ }
221
221
  summary["macro"] = compute_macro_stats(summary["per_template"])
222
222
  return summary
@@ -1,12 +1,11 @@
1
1
  import json
2
2
  from collections import defaultdict
3
- from typing import Any
4
3
  from collections.abc import Sequence
4
+ from typing import Any
5
5
 
6
6
  from .retrieval_context_ids import recall_at_k
7
7
  from .sparql import compare_sparql_results
8
8
 
9
-
10
9
  Match = tuple[int, int, int, float]
11
10
  Step = dict[str, Any]
12
11
  StepsGroup = Sequence[Step] # We will index into a group
@@ -23,6 +22,7 @@ def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
23
22
  json.loads(actual_output),
24
23
  reference_step["required_columns"],
25
24
  reference_step.get("ordered", False),
25
+ reference_step.get("ignore_duplicates", True),
26
26
  )
27
27
  if reference_step.get("output_media_type") == "application/json":
28
28
  return float(json.loads(reference_output) == json.loads(actual_output))
@@ -1,7 +1,7 @@
1
- from collections import Counter
2
- from typing import Union
3
1
  import itertools
4
2
  import math
3
+ from collections import Counter
4
+ from typing import Union
5
5
 
6
6
  XSD_NUMERIC_TYPES = {
7
7
  "http://www.w3.org/2001/XMLSchema#integer",
@@ -35,7 +35,7 @@ def truncate(number: float, decimals: int = 0) -> float:
35
35
  elif decimals == 0:
36
36
  return math.trunc(number)
37
37
 
38
- factor = 10.0**decimals
38
+ factor = 10.0 ** decimals
39
39
  return math.trunc(number * factor) / factor
40
40
 
41
41
 
@@ -137,8 +137,8 @@ def compare_values(
137
137
  actual_vars: Union[list[str], tuple[str, ...]],
138
138
  actual_var_to_values: dict[str, list],
139
139
  results_are_ordered: bool,
140
+ ignore_duplicates: bool,
140
141
  ) -> bool:
141
-
142
142
  if len(reference_vars) < len(actual_vars):
143
143
  for combination in itertools.combinations(actual_vars, len(reference_vars)):
144
144
  if compare_values(
@@ -147,6 +147,7 @@ def compare_values(
147
147
  combination,
148
148
  actual_var_to_values,
149
149
  results_are_ordered,
150
+ ignore_duplicates,
150
151
  ):
151
152
  return True
152
153
  return False
@@ -154,9 +155,9 @@ def compare_values(
154
155
  table = convert_table_dict2lines(reference_vars, reference_var_to_values)
155
156
  for permutation in itertools.permutations(actual_vars):
156
157
  actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
157
- if (results_are_ordered and table == actual_table) or (
158
- not results_are_ordered and Counter(table) == Counter(actual_table)
159
- ):
158
+ if (results_are_ordered and table == actual_table) or \
159
+ ((not results_are_ordered) and ignore_duplicates and set(table) == set(actual_table)) or \
160
+ ((not results_are_ordered) and (not ignore_duplicates) and Counter(table) == Counter(actual_table)):
160
161
  return True
161
162
 
162
163
  return False
@@ -167,6 +168,7 @@ def compare_sparql_results(
167
168
  actual_sparql_result: dict,
168
169
  required_vars: list[str],
169
170
  results_are_ordered: bool = False,
171
+ ignore_duplicates: bool = True,
170
172
  ) -> float:
171
173
  # DESCRIBE results
172
174
  if isinstance(actual_sparql_result, str):
@@ -208,5 +210,6 @@ def compare_sparql_results(
208
210
  actual_vars,
209
211
  actual_var_to_values,
210
212
  results_are_ordered,
213
+ ignore_duplicates,
211
214
  )
212
215
  )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "5.1.1"
3
+ version = "5.2.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
File without changes