graphrag-eval 5.1.2__tar.gz → 5.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: graphrag-eval
3
- Version: 5.1.2
3
+ Version: 5.3.0
4
4
  Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
5
5
  License: Apache-2.0
6
6
  Author: Philip Ganchev
@@ -19,7 +19,7 @@ Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
19
19
  Description-Content-Type: text/markdown
20
20
 
21
21
  <p align="center">
22
- <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
22
+ <img alt="Graphwise Logo" src="https://raw.githubusercontent.com/Ontotext-AD/graphrag-eval/main/.github/Graphwise_Logo.jpg">
23
23
  </p>
24
24
 
25
25
  # QA Evaluation
@@ -28,7 +28,7 @@ This is a Python module for assessing the quality of question-answering systems
28
28
 
29
29
  ## License
30
30
 
31
- Apache-2.0 License. See [LICENSE](LICENSE) file for details.
31
+ Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
32
32
 
33
33
  ## Installation
34
34
 
@@ -107,6 +107,7 @@ Each step includes:
107
107
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
108
108
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
109
109
  - `required_columns`: (optional; required only for SPARQL query results) list of binding names that must be present for SPARQL query results to match
110
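+ - `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate rows in the reference or in the actual results are ignored in the comparison. Applies only when `ordered` is `false`: `true` means that rows are matched as a set; `false` means that each row must occur the same number of times in both results.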
110
111
 
111
112
  #### Reference Data
112
113
 
@@ -591,7 +592,7 @@ Aggregates are:
591
592
  - `per_template`: a dictionary mapping a template identifier to the following statistics:
592
593
  - `number_of_error_samples`: number of questions for this template that resulted in an error response
593
594
  - `number_of_success_samples`: number of questions for this template that resulted in a successful response
594
- - `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for this template for the following metrics:
595
+ - `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
595
596
  - `input_tokens`
596
597
  - `output_tokens`
597
598
  - `total_tokens`
@@ -608,14 +609,19 @@ Aggregates are:
608
609
  - `retrieval_context_precision`
609
610
  - `retrieval_context_f1`
610
611
  - `steps`: includes:
611
- - `steps`: for each step type how many times it was executed
612
+ - `total`: for each step type how many times it was executed
612
613
  - `once_per_sample`: how many times each step was executed, counted only once per question
613
614
  - `empty_results`: how many times the step was executed and returned empty results
614
615
  - `errors`: how many times the step was executed and resulted in error
615
616
  - `micro`: statistics across questions, regardless of template. It includes:
616
617
  - `number_of_error_samples`: total number of questions that resulted in an error response
617
618
  - `number_of_success_samples`: total number of questions that resulted in a successful response
618
- - `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for the following metrics:
619
+ - `steps`: includes:
620
+ - `total`: for each step type how many times it was executed
621
+ - `once_per_sample`: how many times each step was executed, counted only once per question
622
+ - `empty_results`: how many times the step was executed and returned empty results
623
+ - `errors`: how many times the step was executed and resulted in error
624
+ - `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
619
625
  - `input_tokens`
620
626
  - `output_tokens`
621
627
  - `total_tokens`
@@ -1,5 +1,5 @@
1
1
  <p align="center">
2
- <img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
2
+ <img alt="Graphwise Logo" src="https://raw.githubusercontent.com/Ontotext-AD/graphrag-eval/main/.github/Graphwise_Logo.jpg">
3
3
  </p>
4
4
 
5
5
  # QA Evaluation
@@ -8,7 +8,7 @@ This is a Python module for assessing the quality of question-answering systems
8
8
 
9
9
  ## License
10
10
 
11
- Apache-2.0 License. See [LICENSE](LICENSE) file for details.
11
+ Apache-2.0 License. See [LICENSE](https://github.com/Ontotext-AD/graphrag-eval/blob/main/LICENSE) file for details.
12
12
 
13
13
  ## Installation
14
14
 
@@ -87,6 +87,7 @@ Each step includes:
87
87
  - `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
88
88
  - `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
89
89
  - `required_columns`: (optional; required only for SPARQL query results) list of binding names that must be present for SPARQL query results to match
90
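+ - `ignore_duplicates`: (optional, defaults to `true`) For SPARQL query results, whether duplicate rows in the reference or in the actual results are ignored in the comparison. Applies only when `ordered` is `false`: `true` means that rows are matched as a set; `false` means that each row must occur the same number of times in both results.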
90
91
 
91
92
  #### Reference Data
92
93
 
@@ -571,7 +572,7 @@ Aggregates are:
571
572
  - `per_template`: a dictionary mapping a template identifier to the following statistics:
572
573
  - `number_of_error_samples`: number of questions for this template that resulted in an error response
573
574
  - `number_of_success_samples`: number of questions for this template that resulted in a successful response
574
- - `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for this template for the following metrics:
575
+ - `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics over all questions of this template for which the metrics exist:
575
576
  - `input_tokens`
576
577
  - `output_tokens`
577
578
  - `total_tokens`
@@ -588,14 +589,19 @@ Aggregates are:
588
589
  - `retrieval_context_precision`
589
590
  - `retrieval_context_f1`
590
591
  - `steps`: includes:
591
- - `steps`: for each step type how many times it was executed
592
+ - `total`: for each step type how many times it was executed
592
593
  - `once_per_sample`: how many times each step was executed, counted only once per question
593
594
  - `empty_results`: how many times the step was executed and returned empty results
594
595
  - `errors`: how many times the step was executed and resulted in error
595
596
  - `micro`: statistics across questions, regardless of template. It includes:
596
597
  - `number_of_error_samples`: total number of questions that resulted in an error response
597
598
  - `number_of_success_samples`: total number of questions that resulted in a successful response
598
- - `sum`, `mean`, `median`, `min` and `max` statistics over all non-error responses for the following metrics:
599
+ - `steps`: includes:
600
+ - `total`: for each step type how many times it was executed
601
+ - `once_per_sample`: how many times each step was executed, counted only once per question
602
+ - `empty_results`: how many times the step was executed and returned empty results
603
+ - `errors`: how many times the step was executed and resulted in error
604
+ - `sum`, `mean`, `median`, `min` and `max` statistics for the following metrics, over all questions where the metrics exist:
599
605
  - `input_tokens`
600
606
  - `output_tokens`
601
607
  - `total_tokens`
@@ -131,9 +131,10 @@ def compute_per_template_stats(
131
131
 
132
132
 
133
133
  def compute_micro_stats(
134
- number_of_samples_per_template_by_status,
135
- stats_per_template,
136
- step_metrics_per_template
134
+ number_of_samples_per_template_by_status: dict[str, dict[str, int]],
135
+ stats_per_template: dict[str, dict[str, Sequence[int]]],
136
+ steps_summary_per_template: dict[str, dict[str, dict[str, int]]],
137
+ step_metrics_per_template: dict[str, dict[str, Sequence[int]]],
137
138
  ) -> dict:
138
139
  values = number_of_samples_per_template_by_status.values()
139
140
  micro_summary = defaultdict(dict, {
@@ -157,6 +158,16 @@ def compute_micro_stats(
157
158
  micro_step_metrics[metric].extend(values)
158
159
  for metric, values in micro_step_metrics.items():
159
160
  micro_summary[metric] = stats_for_series(values)
161
+
162
+ steps_summary = defaultdict(lambda: defaultdict(int))
163
+ for template_steps_summary in steps_summary_per_template.values():
164
+ for summary_name, steps_stats in template_steps_summary.items():
165
+ for step_id, count in steps_stats.items():
166
+ steps_summary[summary_name][step_id] += count
167
+ steps_summary = {k: dict(v) for k, v in steps_summary.items()}
168
+ if len(steps_summary) > 0:
169
+ micro_summary["steps"] = steps_summary
170
+
160
171
  return dict(micro_summary)
161
172
 
162
173
 
@@ -198,8 +209,8 @@ def compute_aggregates(samples: list[dict]) -> dict:
198
209
 
199
210
  if "error" in sample:
200
211
  number_of_samples_per_template_by_status[template_id]["error"] += 1
201
- continue
202
- number_of_samples_per_template_by_status[template_id]["success"] += 1
212
+ else:
213
+ number_of_samples_per_template_by_status[template_id]["success"] += 1
203
214
  update_stats(sample, stats_per_template[template_id])
204
215
  update_steps_summary(sample, steps_summary_per_template[template_id])
205
216
  update_step_metrics(sample, step_metrics_per_template[template_id])
@@ -215,6 +226,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
215
226
  "micro": compute_micro_stats(
216
227
  number_of_samples_per_template_by_status,
217
228
  stats_per_template,
229
+ steps_summary_per_template,
218
230
  step_metrics_per_template
219
231
  )
220
232
  }
@@ -6,7 +6,7 @@ def run_evaluation(
6
6
  responses_dict: dict,
7
7
  ) -> list[dict]:
8
8
  # Output metrics are not nested, for simpler aggregation
9
- answer_correctess_evaluator = None
9
+ answer_correctness_evaluator = None
10
10
  evaluation_results = []
11
11
  for template in qa_dataset:
12
12
  template_id = template["template_id"]
@@ -26,9 +26,9 @@ def run_evaluation(
26
26
  "status": "error",
27
27
  "error": actual_result["error"],
28
28
  })
29
- evaluation_results.append(eval_result)
30
- continue
31
- eval_result["status"] = "success"
29
+ else:
30
+ eval_result["status"] = "success"
31
+
32
32
  if "actual_answer" in actual_result:
33
33
  eval_result["actual_answer"] = actual_result["actual_answer"]
34
34
  from graphrag_eval import answer_relevance
@@ -38,25 +38,24 @@ def run_evaluation(
38
38
  actual_result["actual_answer"],
39
39
  )
40
40
  )
41
- if "reference_answer" in question and "actual_answer" in actual_result:
42
- from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
43
- if not answer_correctess_evaluator:
44
- answer_correctess_evaluator = AnswerCorrectnessEvaluator()
45
- eval_result.update(
46
- answer_correctess_evaluator.get_correctness_dict(
47
- question,
48
- actual_result,
41
+
42
+ if "reference_answer" in question:
43
+ from graphrag_eval.answer_correctness import AnswerCorrectnessEvaluator
44
+ if not answer_correctness_evaluator:
45
+ answer_correctness_evaluator = AnswerCorrectnessEvaluator()
46
+ eval_result.update(
47
+ answer_correctness_evaluator.get_correctness_dict(
48
+ question,
49
+ actual_result,
50
+ )
49
51
  )
50
- )
51
- if "actual_steps" in actual_result:
52
- eval_result.update(
53
- get_steps_evaluation_result_dict(question, actual_result)
54
- )
55
- eval_result.update({
56
- "input_tokens": actual_result["input_tokens"],
57
- "output_tokens": actual_result["output_tokens"],
58
- "total_tokens": actual_result["total_tokens"],
59
- "elapsed_sec": actual_result["elapsed_sec"],
60
- })
52
+
53
+ eval_result.update(
54
+ get_steps_evaluation_result_dict(question, actual_result)
55
+ )
56
+ for key in "input_tokens", "output_tokens", "total_tokens", "elapsed_sec":
57
+ if key in actual_result:
58
+ eval_result[key] = actual_result[key]
59
+
61
60
  evaluation_results.append(eval_result)
62
61
  return evaluation_results
@@ -1,12 +1,11 @@
1
1
  import json
2
2
  from collections import defaultdict
3
- from typing import Any
4
3
  from collections.abc import Sequence
4
+ from typing import Any
5
5
 
6
6
  from .retrieval_context_ids import recall_at_k
7
7
  from .sparql import compare_sparql_results
8
8
 
9
-
10
9
  Match = tuple[int, int, int, float]
11
10
  Step = dict[str, Any]
12
11
  StepsGroup = Sequence[Step] # We will index into a group
@@ -23,6 +22,7 @@ def compare_steps_outputs(reference_step: Step, actual_step: Step) -> float:
23
22
  json.loads(actual_output),
24
23
  reference_step["required_columns"],
25
24
  reference_step.get("ordered", False),
25
+ reference_step.get("ignore_duplicates", True),
26
26
  )
27
27
  if reference_step.get("output_media_type") == "application/json":
28
28
  return float(json.loads(reference_output) == json.loads(actual_output))
@@ -1,7 +1,7 @@
1
- from collections import Counter
2
- from typing import Union
3
1
  import itertools
4
2
  import math
3
+ from collections import Counter
4
+ from typing import Union
5
5
 
6
6
  XSD_NUMERIC_TYPES = {
7
7
  "http://www.w3.org/2001/XMLSchema#integer",
@@ -35,7 +35,7 @@ def truncate(number: float, decimals: int = 0) -> float:
35
35
  elif decimals == 0:
36
36
  return math.trunc(number)
37
37
 
38
- factor = 10.0**decimals
38
+ factor = 10.0 ** decimals
39
39
  return math.trunc(number * factor) / factor
40
40
 
41
41
 
@@ -137,8 +137,8 @@ def compare_values(
137
137
  actual_vars: Union[list[str], tuple[str, ...]],
138
138
  actual_var_to_values: dict[str, list],
139
139
  results_are_ordered: bool,
140
+ ignore_duplicates: bool,
140
141
  ) -> bool:
141
-
142
142
  if len(reference_vars) < len(actual_vars):
143
143
  for combination in itertools.combinations(actual_vars, len(reference_vars)):
144
144
  if compare_values(
@@ -147,6 +147,7 @@ def compare_values(
147
147
  combination,
148
148
  actual_var_to_values,
149
149
  results_are_ordered,
150
+ ignore_duplicates,
150
151
  ):
151
152
  return True
152
153
  return False
@@ -154,9 +155,9 @@ def compare_values(
154
155
  table = convert_table_dict2lines(reference_vars, reference_var_to_values)
155
156
  for permutation in itertools.permutations(actual_vars):
156
157
  actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
157
- if (results_are_ordered and table == actual_table) or (
158
- not results_are_ordered and Counter(table) == Counter(actual_table)
159
- ):
158
+ if (results_are_ordered and table == actual_table) or \
159
+ ((not results_are_ordered) and ignore_duplicates and set(table) == set(actual_table)) or \
160
+ ((not results_are_ordered) and (not ignore_duplicates) and Counter(table) == Counter(actual_table)):
160
161
  return True
161
162
 
162
163
  return False
@@ -167,6 +168,7 @@ def compare_sparql_results(
167
168
  actual_sparql_result: dict,
168
169
  required_vars: list[str],
169
170
  results_are_ordered: bool = False,
171
+ ignore_duplicates: bool = True,
170
172
  ) -> float:
171
173
  # DESCRIBE results
172
174
  if isinstance(actual_sparql_result, str):
@@ -208,5 +210,6 @@ def compare_sparql_results(
208
210
  actual_vars,
209
211
  actual_var_to_values,
210
212
  results_are_ordered,
213
+ ignore_duplicates,
211
214
  )
212
215
  )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-eval"
3
- version = "5.1.2"
3
+ version = "5.3.0"
4
4
  description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
5
5
  authors = [
6
6
  { name = "Philip Ganchev", email = "philip.ganchev@graphwise.ai" },
File without changes