graphrag-eval 4.0.0__tar.gz → 5.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphrag_eval-4.0.0/README.md → graphrag_eval-5.0.0/PKG-INFO +133 -9
- graphrag_eval-4.0.0/PKG-INFO → graphrag_eval-5.0.0/README.md +118 -24
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/aggregation.py +66 -1
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/evaluation.py +1 -1
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/steps/__init__.py +36 -9
- graphrag_eval-5.0.0/graphrag_eval/steps/retrieval_answer.py +62 -0
- graphrag_eval-5.0.0/graphrag_eval/steps/retrieval_context_ids.py +50 -0
- graphrag_eval-5.0.0/graphrag_eval/steps/retrieval_context_texts.py +59 -0
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/steps/sparql.py +89 -15
- graphrag_eval-5.0.0/graphrag_eval/util.py +25 -0
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/pyproject.toml +3 -3
- graphrag_eval-4.0.0/graphrag_eval/steps/retrieval.py +0 -55
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/LICENSE +0 -0
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/__init__.py +0 -0
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/answer_correctness.py +0 -0
- {graphrag_eval-4.0.0 → graphrag_eval-5.0.0}/graphrag_eval/answer_relevance.py +0 -0
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: graphrag-eval
|
|
3
|
+
Version: 5.0.0
|
|
4
|
+
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Neli Hateva
|
|
7
|
+
Author-email: neli.hateva@graphwise.ai
|
|
8
|
+
Requires-Python: >=3.12,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Project-URL: Repository, https://github.com/Ontotext-AD/graphrag-eval
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
1
15
|
<p align="center">
|
|
2
16
|
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
3
17
|
</p>
|
|
@@ -36,7 +50,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
|
|
|
36
50
|
## Maintainers
|
|
37
51
|
|
|
38
52
|
Developed and maintained by [Graphwise](https://graphwise.ai/).
|
|
39
|
-
For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/
|
|
53
|
+
For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
|
|
40
54
|
|
|
41
55
|
## Command Line Use
|
|
42
56
|
|
|
@@ -77,13 +91,14 @@ A reference corpus is a list of templates, each of which contains:
|
|
|
77
91
|
- `question_text`: The natural language query passed to the LLM
|
|
78
92
|
- `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
|
|
79
93
|
- `reference_answer`: (optional) The expected answer to the question
|
|
94
|
+
|
|
80
95
|
The assumption is that the final answer to the question is derived from the outputs of the steps, which are executed last (last level).
|
|
81
96
|
|
|
82
97
|
Each step includes:
|
|
83
98
|
|
|
84
99
|
- `name`: The type of step being performed (e.g., `sparql_query`)
|
|
85
100
|
- `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
|
|
86
|
-
- `output`: The expected output from the step
|
|
101
|
+
- `output`: The expected output from the step.
|
|
87
102
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
88
103
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
89
104
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
@@ -99,7 +114,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
|
|
|
99
114
|
question_text: List all transformers within Substation OSLO
|
|
100
115
|
reference_answer: OSLO T1, OSLO T2
|
|
101
116
|
reference_steps:
|
|
102
|
-
- - name:
|
|
117
|
+
- - name: retrieval
|
|
118
|
+
args:
|
|
119
|
+
query: transformers Substation OSLO
|
|
120
|
+
k: 2
|
|
121
|
+
output: |-
|
|
122
|
+
[
|
|
123
|
+
{
|
|
124
|
+
"id": "http://example.com/resource/doc/1",
|
|
125
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
"id": "http://example.com/resource/doc/2",
|
|
129
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
130
|
+
}
|
|
131
|
+
]
|
|
132
|
+
- name: sparql_query
|
|
103
133
|
args:
|
|
104
134
|
query: |2
|
|
105
135
|
|
|
@@ -253,6 +283,16 @@ Below is an example response from the question-answering system for a single que
|
|
|
253
283
|
"total_tokens": 298753,
|
|
254
284
|
"elapsed_sec": 46.48961806297302,
|
|
255
285
|
"actual_steps": [
|
|
286
|
+
{
|
|
287
|
+
"name": "retrieval",
|
|
288
|
+
"args": {
|
|
289
|
+
"query": "transformers Substation OSLO",
|
|
290
|
+
"k": 2
|
|
291
|
+
},
|
|
292
|
+
"id": "call_3",
|
|
293
|
+
"status": "success",
|
|
294
|
+
"output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
|
|
295
|
+
},
|
|
256
296
|
{
|
|
257
297
|
"name": "autocomplete_search",
|
|
258
298
|
"args": {
|
|
@@ -323,7 +363,23 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
323
363
|
question_text: List all transformers within Substation OSLO
|
|
324
364
|
reference_answer: OSLO T1, OSLO T2
|
|
325
365
|
reference_steps:
|
|
326
|
-
- - name:
|
|
366
|
+
- - name: retrieval
|
|
367
|
+
args:
|
|
368
|
+
query: transformers Substation OSLO
|
|
369
|
+
k: 2
|
|
370
|
+
matches: call_3
|
|
371
|
+
output: |-
|
|
372
|
+
[
|
|
373
|
+
{
|
|
374
|
+
"id": "http://example.com/resource/doc/1",
|
|
375
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
376
|
+
},
|
|
377
|
+
{
|
|
378
|
+
"id": "http://example.com/resource/doc/2",
|
|
379
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
380
|
+
}
|
|
381
|
+
]
|
|
382
|
+
- name: sparql_query
|
|
327
383
|
args:
|
|
328
384
|
query: |2
|
|
329
385
|
|
|
@@ -364,6 +420,31 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
364
420
|
answer_relevance: 0.9
|
|
365
421
|
answer_relevance_cost: 0.0007
|
|
366
422
|
actual_steps:
|
|
423
|
+
- name: retrieval
|
|
424
|
+
id: call_3
|
|
425
|
+
args:
|
|
426
|
+
query: transformers Substation OSLO
|
|
427
|
+
k: 2
|
|
428
|
+
status: success
|
|
429
|
+
output: |-
|
|
430
|
+
[
|
|
431
|
+
{
|
|
432
|
+
"id": "http://example.com/resource/doc/1",
|
|
433
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
434
|
+
},
|
|
435
|
+
{
|
|
436
|
+
"id": "http://example.com/resource/doc/2",
|
|
437
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
438
|
+
}
|
|
439
|
+
]
|
|
440
|
+
retrieval_answer_recall: 1.0
|
|
441
|
+
retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
|
|
442
|
+
retrieval_answer_recall_cost: 0.0007
|
|
443
|
+
retrieval_answer_precision: 1.0
|
|
444
|
+
retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
|
|
445
|
+
retrieval_answer_precision_cost: 0.0003
|
|
446
|
+
retrieval_answer_f1: 1.0
|
|
447
|
+
retrieval_answer_f1_cost: 0.001
|
|
367
448
|
- name: autocomplete_search
|
|
368
449
|
args:
|
|
369
450
|
query: OSLO
|
|
@@ -470,12 +551,33 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
470
551
|
- `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
|
|
471
552
|
- `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
|
|
472
553
|
- `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
|
|
473
|
-
- `steps_score`: a real number between 0 and 1, computed by comparing the results of the last
|
|
554
|
+
- `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
|
|
555
|
+
- If there is no match in the actual steps, then the score is `0.0`
|
|
556
|
+
- If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
|
|
557
|
+
- Otherwise, the score is the number of matched steps in the last group divided by the total number of steps in the last group.
|
|
474
558
|
- `input_tokens`: input tokens usage
|
|
475
559
|
- `output_tokens`: output tokens usage
|
|
476
560
|
- `total_tokens`: total tokens usage
|
|
477
561
|
- `elapsed_sec`: elapsed seconds
|
|
478
562
|
|
|
563
|
+
All `actual_steps` with `name` "retrieval" contain:
|
|
564
|
+
- `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
565
|
+
- `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
|
|
566
|
+
- `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
|
|
567
|
+
- `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
|
|
568
|
+
- `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
569
|
+
- `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
|
|
570
|
+
- `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
|
|
571
|
+
- `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
|
|
572
|
+
- `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
|
|
573
|
+
- `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
|
|
574
|
+
- `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
575
|
+
- `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
|
|
576
|
+
- `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
577
|
+
- `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
|
|
578
|
+
- `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
|
|
579
|
+
|
|
580
|
+
|
|
479
581
|
#### Aggregates Keys
|
|
480
582
|
|
|
481
583
|
The `aggregates` object provides aggregated evaluation metrics.
|
|
@@ -499,6 +601,9 @@ Aggregates are:
|
|
|
499
601
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
500
602
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
501
603
|
- `errors`: how many times the step was executed and resulted in error
|
|
604
|
+
- `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
|
|
605
|
+
- `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
|
|
606
|
+
- `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
|
|
502
607
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
503
608
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
504
609
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
@@ -511,6 +616,9 @@ Aggregates are:
|
|
|
511
616
|
- `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
|
|
512
617
|
- `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
|
|
513
618
|
- `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
|
|
619
|
+
- `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
|
|
620
|
+
- `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
|
|
621
|
+
- `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
|
|
514
622
|
- `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
|
|
515
623
|
- `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
|
|
516
624
|
- `input_tokens`: `mean` for `input_tokens`
|
|
@@ -522,6 +630,9 @@ Aggregates are:
|
|
|
522
630
|
- `answer_f1`: `mean` for `answer_f1`
|
|
523
631
|
- `answer_relevance`: `mean` for `answer_relevance`
|
|
524
632
|
- `answer_relevance_cost`: `mean` for `answer_relevance_cost`
|
|
633
|
+
- `retrieval_context_recall`: `mean` for `retrieval_context_recall`
|
|
634
|
+
- `retrieval_context_precision`: `mean` for `retrieval_context_precision`
|
|
635
|
+
- `retrieval_context_f1`: `mean` for `retrieval_context_f1`
|
|
525
636
|
- `steps_score`: `mean` for `steps_score`
|
|
526
637
|
|
|
527
638
|
#### Example Aggregates
|
|
@@ -898,18 +1009,30 @@ macro:
|
|
|
898
1009
|
mean: 25.911653497483996
|
|
899
1010
|
```
|
|
900
1011
|
|
|
1012
|
+
### SPARQL queries comparison
|
|
1013
|
+
|
|
1014
|
+
The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
|
|
1015
|
+
For each subset, it compares the set of columns (skipping optional columns).
|
|
1016
|
+
It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
|
|
1017
|
+
|
|
1018
|
+
The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
|
|
1019
|
+
|
|
1020
|
+
* *nr* is the number of rows in the actual result
|
|
1021
|
+
* *nc_ref* is the number of columns in the reference result
|
|
1022
|
+
* *nc_act* is the number of columns in the actual result
|
|
1023
|
+
|
|
901
1024
|
### Retrieval Evaluation
|
|
902
1025
|
|
|
903
|
-
The following metrics are based on the
|
|
1026
|
+
The following metrics are based on the content of retrieved documents.
|
|
904
1027
|
|
|
905
|
-
#### Recall@k
|
|
1028
|
+
#### Context Recall@k
|
|
906
1029
|
|
|
907
1030
|
The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
|
|
908
1031
|
* **Formula**:
|
|
909
1032
|
$`
|
|
910
1033
|
\frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
|
|
911
1034
|
`$
|
|
912
|
-
* **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the
|
|
1035
|
+
* **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the number of relevant items, counting at most the first `k` of them.
|
|
913
1036
|
* **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
|
|
914
1037
|
|
|
915
1038
|
```python
|
|
@@ -920,7 +1043,7 @@ recall_at_k(
|
|
|
920
1043
|
) # => 0.75
|
|
921
1044
|
```
|
|
922
1045
|
|
|
923
|
-
####
|
|
1046
|
+
#### Context Precision@k
|
|
924
1047
|
|
|
925
1048
|
Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
|
|
926
1049
|
* **Formula**:
|
|
@@ -950,3 +1073,4 @@ average_precision(
|
|
|
950
1073
|
retrieved_docs=[1, 4, 3, 5, 7]
|
|
951
1074
|
) # ~=> 0.8056
|
|
952
1075
|
```
|
|
1076
|
+
|
|
@@ -1,17 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: graphrag-eval
|
|
3
|
-
Version: 4.0.0
|
|
4
|
-
Summary: For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps.
|
|
5
|
-
License: Apache-2.0
|
|
6
|
-
Author: Neli Hateva
|
|
7
|
-
Author-email: neli.hateva@graphwise.ai
|
|
8
|
-
Requires-Python: >=3.12,<3.13
|
|
9
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
-
Project-URL: Repository, https://github.com/Ontotext-AD/qa-eval
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
|
|
15
1
|
<p align="center">
|
|
16
2
|
<img alt="Graphwise Logo" src=".github/Graphwise_Logo.jpg">
|
|
17
3
|
</p>
|
|
@@ -50,7 +36,7 @@ graphrag-eval = {version = "*", extras = ["openai"]}
|
|
|
50
36
|
## Maintainers
|
|
51
37
|
|
|
52
38
|
Developed and maintained by [Graphwise](https://graphwise.ai/).
|
|
53
|
-
For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/
|
|
39
|
+
For issues or feature requests, please open [a GitHub issue](https://github.com/Ontotext-AD/graphrag-eval/issues).
|
|
54
40
|
|
|
55
41
|
## Command Line Use
|
|
56
42
|
|
|
@@ -91,13 +77,14 @@ A reference corpus is a list of templates, each of which contains:
|
|
|
91
77
|
- `question_text`: The natural language query passed to the LLM
|
|
92
78
|
- `reference_steps`: (optional) A list of expected steps grouped by expected order of execution, where all steps in a group can be executed in any order relative to each other, but after all steps in the previous group and before all steps in the next group.
|
|
93
79
|
- `reference_answer`: (optional) The expected answer to the question
|
|
80
|
+
|
|
94
81
|
The assumption is that the final answer to the question is derived from the outputs of the steps, which are executed last (last level).
|
|
95
82
|
|
|
96
83
|
Each step includes:
|
|
97
84
|
|
|
98
85
|
- `name`: The type of step being performed (e.g., `sparql_query`)
|
|
99
86
|
- `args`: Arguments of the step (e.g., arguments to a tool used in the step, such as a SPARQL query)
|
|
100
|
-
- `output`: The expected output from the step
|
|
87
|
+
- `output`: The expected output from the step.
|
|
101
88
|
- `output_media_type`: (optional, missing or one of `application/sparql-results+json`, `application/json`) Indicates how the output of a step must be processed
|
|
102
89
|
- `ordered`: (optional, defaults to `false`) For SPARQL query results, whether results order matters. `true` means that the actual result rows must be ordered as the reference result; `false` means that result rows are matched as a set.
|
|
103
90
|
- `required_columns`: (optional) - required only for SPARQL query results; list of binding names, which are required for SPARQL query results to match
|
|
@@ -113,7 +100,22 @@ The example corpus below illustrates a minimal but realistic Q&A dataset, showin
|
|
|
113
100
|
question_text: List all transformers within Substation OSLO
|
|
114
101
|
reference_answer: OSLO T1, OSLO T2
|
|
115
102
|
reference_steps:
|
|
116
|
-
- - name:
|
|
103
|
+
- - name: retrieval
|
|
104
|
+
args:
|
|
105
|
+
query: transformers Substation OSLO
|
|
106
|
+
k: 2
|
|
107
|
+
output: |-
|
|
108
|
+
[
|
|
109
|
+
{
|
|
110
|
+
"id": "http://example.com/resource/doc/1",
|
|
111
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
"id": "http://example.com/resource/doc/2",
|
|
115
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
116
|
+
}
|
|
117
|
+
]
|
|
118
|
+
- name: sparql_query
|
|
117
119
|
args:
|
|
118
120
|
query: |2
|
|
119
121
|
|
|
@@ -267,6 +269,16 @@ Below is an example response from the question-answering system for a single que
|
|
|
267
269
|
"total_tokens": 298753,
|
|
268
270
|
"elapsed_sec": 46.48961806297302,
|
|
269
271
|
"actual_steps": [
|
|
272
|
+
{
|
|
273
|
+
"name": "retrieval",
|
|
274
|
+
"args": {
|
|
275
|
+
"query": "transformers Substation OSLO",
|
|
276
|
+
"k": 2
|
|
277
|
+
},
|
|
278
|
+
"id": "call_3",
|
|
279
|
+
"status": "success",
|
|
280
|
+
"output": "[\n {\n \"id\": \"http://example.com/resource/doc/1\",\n \"text\": \"Transformer OSLO T1 is in Substation Oslo.\"\n },\n {\n \"id\": \"http://example.com/resource/doc/2\",\n \"text\": \"Transformer OSLO T2 is in Substation Oslo.\"\n }\n]"
|
|
281
|
+
},
|
|
270
282
|
{
|
|
271
283
|
"name": "autocomplete_search",
|
|
272
284
|
"args": {
|
|
@@ -337,7 +349,23 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
337
349
|
question_text: List all transformers within Substation OSLO
|
|
338
350
|
reference_answer: OSLO T1, OSLO T2
|
|
339
351
|
reference_steps:
|
|
340
|
-
- - name:
|
|
352
|
+
- - name: retrieval
|
|
353
|
+
args:
|
|
354
|
+
query: transformers Substation OSLO
|
|
355
|
+
k: 2
|
|
356
|
+
matches: call_3
|
|
357
|
+
output: |-
|
|
358
|
+
[
|
|
359
|
+
{
|
|
360
|
+
"id": "http://example.com/resource/doc/1",
|
|
361
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
362
|
+
},
|
|
363
|
+
{
|
|
364
|
+
"id": "http://example.com/resource/doc/2",
|
|
365
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
366
|
+
}
|
|
367
|
+
]
|
|
368
|
+
- name: sparql_query
|
|
341
369
|
args:
|
|
342
370
|
query: |2
|
|
343
371
|
|
|
@@ -378,6 +406,31 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
378
406
|
answer_relevance: 0.9
|
|
379
407
|
answer_relevance_cost: 0.0007
|
|
380
408
|
actual_steps:
|
|
409
|
+
- name: retrieval
|
|
410
|
+
id: call_3
|
|
411
|
+
args:
|
|
412
|
+
query: transformers Substation OSLO
|
|
413
|
+
k: 2
|
|
414
|
+
status: success
|
|
415
|
+
output: |-
|
|
416
|
+
[
|
|
417
|
+
{
|
|
418
|
+
"id": "http://example.com/resource/doc/1",
|
|
419
|
+
"text": "Transformer OSLO T1 is in Substation Oslo."
|
|
420
|
+
},
|
|
421
|
+
{
|
|
422
|
+
"id": "http://example.com/resource/doc/2",
|
|
423
|
+
"text": "Transformer OSLO T2 is in Substation Oslo."
|
|
424
|
+
}
|
|
425
|
+
]
|
|
426
|
+
retrieval_answer_recall: 1.0
|
|
427
|
+
retrieval_answer_recall_reason: The context contains all the transformers listed in the reference answer
|
|
428
|
+
retrieval_answer_recall_cost: 0.0007
|
|
429
|
+
retrieval_answer_precision: 1.0
|
|
430
|
+
retrieval_answer_precision_reason: The context contains only transformers listed in the reference answer
|
|
431
|
+
retrieval_answer_precision_cost: 0.0003
|
|
432
|
+
retrieval_answer_f1: 1.0
|
|
433
|
+
retrieval_answer_f1_cost: 0.001
|
|
381
434
|
- name: autocomplete_search
|
|
382
435
|
args:
|
|
383
436
|
query: OSLO
|
|
@@ -484,12 +537,33 @@ The output is a list of statistics for each question from the reference Q&A data
|
|
|
484
537
|
- `answer_relevance_error`: (optional) error message if answer relevance evaluation failed
|
|
485
538
|
- `answer_relevance_cost`: The LLM use cost of computing `answer_relevance`, in US dollars
|
|
486
539
|
- `actual_steps`: (optional) copy of the steps in the evaluation target, if specified there
|
|
487
|
-
- `steps_score`: a real number between 0 and 1, computed by comparing the results of the last
|
|
540
|
+
- `steps_score`: a real number between 0 and 1, computed by comparing the results of the last executed steps to the output of the reference's last group of steps.
|
|
541
|
+
- If there is no match in the actual steps, then the score is `0.0`
|
|
542
|
+
- If the executed step's name is "retrieval" and the last reference group contains a retrieval step, then the score is the [recall at k](#context-recallk) of the retrieved document ids with respect to the reference.
|
|
543
|
+
- Otherwise, the score is the number of matched steps in the last group divided by the total number of steps in the last group.
|
|
488
544
|
- `input_tokens`: input tokens usage
|
|
489
545
|
- `output_tokens`: output tokens usage
|
|
490
546
|
- `total_tokens`: total tokens usage
|
|
491
547
|
- `elapsed_sec`: elapsed seconds
|
|
492
548
|
|
|
549
|
+
All `actual_steps` with `name` "retrieval" contain:
|
|
550
|
+
- `retrieval_answer_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
551
|
+
- `retrieval_answer_recall_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_recall`
|
|
552
|
+
- `retrieval_answer_recall_error`: (optional) error message if `retrieval_answer_recall` evaluation fails
|
|
553
|
+
- `retrieval_answer_recall_cost`: cost of evaluating `retrieval_answer_recall`, in US dollars
|
|
554
|
+
- `retrieval_answer_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
555
|
+
- `retrieval_answer_precision_reason`: (optional) LLM reasoning in evaluating `retrieval_answer_precision`
|
|
556
|
+
- `retrieval_answer_precision_error`: (optional) error message if `retrieval_answer_precision` evaluation fails
|
|
557
|
+
- `retrieval_answer_precision_cost`: cost of evaluating `retrieval_answer_precision`, in US dollars
|
|
558
|
+
- `retrieval_answer_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_answer_recall` and `retrieval_answer_precision` succeed
|
|
559
|
+
- `retrieval_answer_f1_cost`: The sum of `retrieval_answer_recall_cost` and `retrieval_answer_precision_cost`
|
|
560
|
+
- `retrieval_context_recall`: (optional) recall of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
561
|
+
- `retrieval_context_recall_error`: (optional) error message if `retrieval_context_recall` evaluation fails
|
|
562
|
+
- `retrieval_context_precision`: (optional) precision of the retrieved context with respect to the reference answer, if evaluation succeeds
|
|
563
|
+
- `retrieval_context_precision_error`: (optional) error message if `retrieval_context_precision` evaluation fails
|
|
564
|
+
- `retrieval_context_f1`: (optional) F1 score of the retrieved context with respect to the reference answer, if `retrieval_context_recall` and `retrieval_context_precision` succeed
|
|
565
|
+
|
|
566
|
+
|
|
493
567
|
#### Aggregates Keys
|
|
494
568
|
|
|
495
569
|
The `aggregates` object provides aggregated evaluation metrics.
|
|
@@ -513,6 +587,9 @@ Aggregates are:
|
|
|
513
587
|
- `once_per_sample`: how many times each step was executed, counted only once per question
|
|
514
588
|
- `empty_results`: how many times the step was executed and returned empty results
|
|
515
589
|
- `errors`: how many times the step was executed and resulted in error
|
|
590
|
+
- `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` for all successful questions in this template
|
|
591
|
+
- `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` for all successful questions in this template
|
|
592
|
+
- `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` for all successful questions in this template
|
|
516
593
|
- `micro`: statistics across questions, regardless of template. It includes:
|
|
517
594
|
- `number_of_error_samples`: total number of questions, which resulted in error response
|
|
518
595
|
- `number_of_success_samples`: total number of questions, which resulted in successful response
|
|
@@ -525,6 +602,9 @@ Aggregates are:
|
|
|
525
602
|
- `answer_f1`: `sum`, `mean`, `median`, `min` and `max` for `answer_f1` of all successful questions
|
|
526
603
|
- `answer_relevance`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance` of all successful questions
|
|
527
604
|
- `answer_relevance_cost`: `sum`, `mean`, `median`, `min` and `max` statistics for `answer_relevance_cost` of all successful questions
|
|
605
|
+
- `retrieval_context_recall`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_recall` of all successful questions
|
|
606
|
+
- `retrieval_context_precision`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_precision` of all successful questions
|
|
607
|
+
- `retrieval_context_f1`: `sum`, `mean`, `median`, `min` and `max` statistics for `retrieval_context_f1` of all successful questions
|
|
528
608
|
- `steps_score`: `sum`, `mean`, `median`, `min` and `max` for `steps_score` of all successful questions
|
|
529
609
|
- `macro`: averages across templates, i.e., the mean of each metric per template, averaged. It includes:
|
|
530
610
|
- `input_tokens`: `mean` for `input_tokens`
|
|
@@ -536,6 +616,9 @@ Aggregates are:
|
|
|
536
616
|
- `answer_f1`: `mean` for `answer_f1`
|
|
537
617
|
- `answer_relevance`: `mean` for `answer_relevance`
|
|
538
618
|
- `answer_relevance_cost`: `mean` for `answer_relevance_cost`
|
|
619
|
+
- `retrieval_context_recall`: `mean` for `retrieval_context_recall`
|
|
620
|
+
- `retrieval_context_precision`: `mean` for `retrieval_context_precision`
|
|
621
|
+
- `retrieval_context_f1`: `mean` for `retrieval_context_f1`
|
|
539
622
|
- `steps_score`: `mean` for `steps_score`
|
|
540
623
|
|
|
541
624
|
#### Example Aggregates
|
|
@@ -912,18 +995,30 @@ macro:
|
|
|
912
995
|
mean: 25.911653497483996
|
|
913
996
|
```
|
|
914
997
|
|
|
998
|
+
### SPARQL queries comparison
|
|
999
|
+
|
|
1000
|
+
The algorithm iterates over all subsets of columns in the actual result of the same size as in the reference result.
|
|
1001
|
+
For each subset, it compares the set of columns (skipping optional columns).
|
|
1002
|
+
It matches floating-point numbers up to a 1e-8 precision. It does not do this for special types such as duration.
|
|
1003
|
+
|
|
1004
|
+
The average time complexity is O(nr\*nc_ref!\*binomial(nc_act, nc_ref)), where
|
|
1005
|
+
|
|
1006
|
+
* *nr* is the number of rows in the actual result
|
|
1007
|
+
* *nc_ref* is the number of columns in the reference result
|
|
1008
|
+
* *nc_act* is the number of columns in the actual result
|
|
1009
|
+
|
|
915
1010
|
### Retrieval Evaluation
|
|
916
1011
|
|
|
917
|
-
The following metrics are based on the
|
|
1012
|
+
The following metrics are based on the content of retrieved documents.
|
|
918
1013
|
|
|
919
|
-
#### Recall@k
|
|
1014
|
+
#### Context Recall@k
|
|
920
1015
|
|
|
921
1016
|
The fraction of relevant items among the top *k* recommendations. It answers the question: "Of all items the user cares about, how many did we include in the first k spots?"
|
|
922
1017
|
* **Formula**:
|
|
923
1018
|
$`
|
|
924
1019
|
\frac{\text{Number of relevant items in top k}}{\text{Number of relevant items}}
|
|
925
1020
|
`$
|
|
926
|
-
* **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the
|
|
1021
|
+
* **Calculation**: Count the number of relevant items in the top `k` retrieved results; divide that by the number of relevant items, counting at most the first `k` of them.
|
|
927
1022
|
* **Example**: Suppose there are 4 relevant documents for a given query. Suppose our system retrieves 3 of them in the top 5 results (`k=5`). Recall@5 is `3 / 4 = 0.75`.
|
|
928
1023
|
|
|
929
1024
|
```python
|
|
@@ -934,7 +1029,7 @@ recall_at_k(
|
|
|
934
1029
|
) # => 0.75
|
|
935
1030
|
```
|
|
936
1031
|
|
|
937
|
-
####
|
|
1032
|
+
#### Context Precision@k
|
|
938
1033
|
|
|
939
1034
|
Evaluates a ranked list of recommendations by looking at the precision at the position of each correctly retrieved item. It rewards systems for placing relevant items higher up in the list. It's more sophisticated than just looking at precision at a single cutoff because it considers the entire ranking.
|
|
940
1035
|
* **Formula**:
|
|
@@ -964,4 +1059,3 @@ average_precision(
|
|
|
964
1059
|
retrieved_docs=[1, 4, 3, 5, 7]
|
|
965
1060
|
) # ~=> 0.8056
|
|
966
1061
|
```
|
|
967
|
-
|
|
@@ -16,7 +16,22 @@ METRICS = [
|
|
|
16
16
|
"total_tokens",
|
|
17
17
|
"elapsed_sec"
|
|
18
18
|
]
|
|
19
|
-
|
|
19
|
+
STEPS_METRICS = {
|
|
20
|
+
"retrieval": [
|
|
21
|
+
"retrieval_answer_precision",
|
|
22
|
+
"retrieval_answer_precision_cost",
|
|
23
|
+
"retrieval_answer_recall",
|
|
24
|
+
"retrieval_answer_recall_cost",
|
|
25
|
+
"retrieval_answer_f1",
|
|
26
|
+
"retrieval_answer_f1_cost",
|
|
27
|
+
"retrieval_context_precision",
|
|
28
|
+
"retrieval_context_precision_cost",
|
|
29
|
+
"retrieval_context_recall",
|
|
30
|
+
"retrieval_context_recall_cost",
|
|
31
|
+
"retrieval_context_f1",
|
|
32
|
+
"retrieval_context_f1_cost",
|
|
33
|
+
]
|
|
34
|
+
}
|
|
20
35
|
PROTECTED_METRICS = [
|
|
21
36
|
"input_tokens",
|
|
22
37
|
"output_tokens",
|
|
@@ -35,6 +50,19 @@ def stats_for_series(values: Iterable[int | float]) -> dict[str, float]:
|
|
|
35
50
|
}
|
|
36
51
|
|
|
37
52
|
|
|
53
|
+
def update_step_metrics_per_template(
|
|
54
|
+
sample: dict,
|
|
55
|
+
step_metrics_per_template: dict,
|
|
56
|
+
template_id: str
|
|
57
|
+
):
|
|
58
|
+
for step in sample.get("actual_steps", []):
|
|
59
|
+
if step["name"] in STEPS_METRICS:
|
|
60
|
+
for metric in STEPS_METRICS[step["name"]]:
|
|
61
|
+
value = step.get(metric)
|
|
62
|
+
if value is not None:
|
|
63
|
+
step_metrics_per_template[template_id][metric].append(value)
|
|
64
|
+
|
|
65
|
+
|
|
38
66
|
def update_stats_per_template(
|
|
39
67
|
sample: dict,
|
|
40
68
|
stats_per_template: dict,
|
|
@@ -76,6 +104,7 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
76
104
|
number_of_samples_per_template_by_status = defaultdict(lambda: defaultdict(int))
|
|
77
105
|
stats_per_template = defaultdict(lambda: defaultdict(list))
|
|
78
106
|
steps_summary_per_template = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
|
|
107
|
+
step_metrics_per_template = defaultdict(lambda: defaultdict(list))
|
|
79
108
|
|
|
80
109
|
# Compute per-template stats
|
|
81
110
|
templates_ids = set()
|
|
@@ -94,6 +123,11 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
94
123
|
steps_summary_per_template,
|
|
95
124
|
template_id
|
|
96
125
|
)
|
|
126
|
+
update_step_metrics_per_template(
|
|
127
|
+
sample,
|
|
128
|
+
step_metrics_per_template,
|
|
129
|
+
template_id
|
|
130
|
+
)
|
|
97
131
|
|
|
98
132
|
summary = {"per_template": {}}
|
|
99
133
|
|
|
@@ -115,6 +149,13 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
115
149
|
if series or metric in PROTECTED_METRICS:
|
|
116
150
|
template_summary[metric] = stats_for_series(series)
|
|
117
151
|
|
|
152
|
+
# Add step metrics for the template
|
|
153
|
+
template_step_metrics = {}
|
|
154
|
+
for metric, values in step_metrics_per_template[template_id].items():
|
|
155
|
+
template_step_metrics[metric] = stats_for_series(values)
|
|
156
|
+
if template_step_metrics:
|
|
157
|
+
template_summary["steps"].update(template_step_metrics)
|
|
158
|
+
|
|
118
159
|
summary["per_template"][template_id] = template_summary
|
|
119
160
|
|
|
120
161
|
# Add micro stats
|
|
@@ -137,6 +178,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
137
178
|
if series or metric in PROTECTED_METRICS:
|
|
138
179
|
summary["micro"][metric] = stats_for_series(series)
|
|
139
180
|
|
|
181
|
+
# Add micro step metrics
|
|
182
|
+
micro_step_metrics = defaultdict(list)
|
|
183
|
+
for template_metrics in step_metrics_per_template.values():
|
|
184
|
+
for metric, values in template_metrics.items():
|
|
185
|
+
micro_step_metrics[metric].extend(values)
|
|
186
|
+
step_metrics = {
|
|
187
|
+
metric: stats_for_series(values)
|
|
188
|
+
for metric, values in micro_step_metrics.items()
|
|
189
|
+
}
|
|
190
|
+
summary["micro"].update(step_metrics)
|
|
191
|
+
|
|
140
192
|
# Add macro stats
|
|
141
193
|
summary["macro"] = {}
|
|
142
194
|
for metric in METRICS:
|
|
@@ -148,4 +200,17 @@ def compute_aggregates(samples: list[dict]) -> dict:
|
|
|
148
200
|
if means or metric in PROTECTED_METRICS:
|
|
149
201
|
summary["macro"][metric] = {"mean": mean(means) if means else 0}
|
|
150
202
|
|
|
203
|
+
# Add macro step metrics
|
|
204
|
+
macro_step_metrics = defaultdict(list)
|
|
205
|
+
for template_id, template_summary in summary["per_template"].items():
|
|
206
|
+
if "steps" in template_summary:
|
|
207
|
+
for metric, stats in template_summary["steps"].items():
|
|
208
|
+
if "mean" in stats:
|
|
209
|
+
macro_step_metrics[metric].append(stats["mean"])
|
|
210
|
+
step_metrics = {
|
|
211
|
+
metric: {"mean": mean(values) if values else 0}
|
|
212
|
+
for metric, values in macro_step_metrics.items()
|
|
213
|
+
}
|
|
214
|
+
summary["macro"].update(step_metrics)
|
|
215
|
+
|
|
151
216
|
return summary
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
|
|
4
|
-
from .
|
|
4
|
+
from .retrieval_context_ids import recall_at_k
|
|
5
5
|
from .sparql import compare_sparql_results
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def compare_steps_outputs(reference: dict, actual: dict) -> float:
|
|
9
|
-
ref_output = reference
|
|
9
|
+
ref_output = reference.get("output")
|
|
10
10
|
act_output = actual["output"]
|
|
11
|
+
assert ref_output, "Reference step output is mandatory"
|
|
11
12
|
if reference.get("output_media_type") == "application/sparql-results+json":
|
|
12
13
|
return compare_sparql_results(
|
|
13
14
|
json.loads(ref_output),
|
|
@@ -17,9 +18,11 @@ def compare_steps_outputs(reference: dict, actual: dict) -> float:
|
|
|
17
18
|
)
|
|
18
19
|
if reference.get("output_media_type") == "application/json":
|
|
19
20
|
return float(json.loads(ref_output) == json.loads(act_output))
|
|
20
|
-
if reference["name"] == "retrieval":
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
if reference["name"] == actual["name"] == "retrieval":
|
|
22
|
+
ref_contexts_ids = [c["id"] for c in json.loads(ref_output)]
|
|
23
|
+
act_contexts_ids = [c["id"] for c in json.loads(act_output)]
|
|
24
|
+
k = actual["args"]["k"]
|
|
25
|
+
return recall_at_k(ref_contexts_ids, act_contexts_ids, k)
|
|
23
26
|
return float(ref_output == act_output)
|
|
24
27
|
|
|
25
28
|
|
|
@@ -95,9 +98,11 @@ def get_steps_matches(
|
|
|
95
98
|
|
|
96
99
|
def evaluate_steps(
|
|
97
100
|
reference_steps_groups: list[list[dict]],
|
|
98
|
-
actual_steps: list[dict]
|
|
101
|
+
actual_steps: list[dict],
|
|
102
|
+
matches: list[tuple[int, int, int, float]] | None = None
|
|
99
103
|
) -> float:
|
|
100
|
-
matches
|
|
104
|
+
if matches is None:
|
|
105
|
+
matches = get_steps_matches(reference_steps_groups, actual_steps)
|
|
101
106
|
matches_by_group = defaultdict(list)
|
|
102
107
|
scores_by_group = defaultdict(float)
|
|
103
108
|
for ref_group_idx, ref_match_idx, actual_idx, score in matches:
|
|
@@ -110,11 +115,33 @@ def evaluate_steps(
|
|
|
110
115
|
|
|
111
116
|
|
|
112
117
|
def get_steps_evaluation_result_dict(reference: dict, target: dict) -> dict:
|
|
113
|
-
act_steps = target["steps"]
|
|
114
118
|
eval_result = {}
|
|
119
|
+
act_steps = target.get("actual_steps", [])
|
|
115
120
|
eval_result["actual_steps"] = act_steps
|
|
121
|
+
for act_step in act_steps:
|
|
122
|
+
if act_step["name"] == "retrieval":
|
|
123
|
+
from .retrieval_answer import get_retrieval_evaluation_dict
|
|
124
|
+
result = get_retrieval_evaluation_dict(
|
|
125
|
+
question_text=reference["question_text"],
|
|
126
|
+
reference_answer=reference.get("reference_answer"),
|
|
127
|
+
actual_answer=target.get("actual_answer"),
|
|
128
|
+
actual_contexts=json.loads(act_step["output"])
|
|
129
|
+
)
|
|
130
|
+
act_step.update(result)
|
|
116
131
|
if "reference_steps" in reference:
|
|
117
132
|
ref_steps = reference["reference_steps"]
|
|
118
|
-
|
|
133
|
+
matches = get_steps_matches(ref_steps, act_steps)
|
|
134
|
+
steps_score = evaluate_steps(ref_steps, act_steps, matches)
|
|
119
135
|
eval_result["steps_score"] = steps_score
|
|
136
|
+
for ref_group_idx, ref_match_idx, act_idx, _ in matches:
|
|
137
|
+
ref_step = ref_steps[ref_group_idx][ref_match_idx]
|
|
138
|
+
act_step = act_steps[act_idx]
|
|
139
|
+
if ref_step["name"] == "retrieval":
|
|
140
|
+
from .retrieval_context_texts import \
|
|
141
|
+
get_retrieval_evaluation_dict
|
|
142
|
+
res = get_retrieval_evaluation_dict(
|
|
143
|
+
reference_contexts=json.loads(ref_step["output"]),
|
|
144
|
+
actual_contexts=json.loads(act_step["output"])
|
|
145
|
+
)
|
|
146
|
+
act_step.update(res)
|
|
120
147
|
return eval_result
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from langevals_ragas.response_context_recall import (
|
|
2
|
+
RagasResponseContextRecallEntry,
|
|
3
|
+
RagasResponseContextRecallEvaluator,
|
|
4
|
+
)
|
|
5
|
+
from langevals_ragas.response_context_precision import (
|
|
6
|
+
RagasResponseContextPrecisionEntry,
|
|
7
|
+
RagasResponseContextPrecisionEvaluator,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from graphrag_eval.util import get_f1_dict
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _evaluate(
|
|
14
|
+
evaluator: RagasResponseContextRecallEvaluator | RagasResponseContextPrecisionEvaluator,
|
|
15
|
+
entry: RagasResponseContextRecallEntry | RagasResponseContextPrecisionEntry,
|
|
16
|
+
metric: str
|
|
17
|
+
) -> dict[str, float | str]:
|
|
18
|
+
try:
|
|
19
|
+
result = evaluator.evaluate(entry)
|
|
20
|
+
if result.status == "processed":
|
|
21
|
+
return {
|
|
22
|
+
f"retrieval_answer_{metric}": result.score,
|
|
23
|
+
f"retrieval_answer_{metric}_cost": result.cost.amount,
|
|
24
|
+
f"retrieval_answer_{metric}_reason": result.details
|
|
25
|
+
}
|
|
26
|
+
else:
|
|
27
|
+
return {
|
|
28
|
+
f"retrieval_answer_{metric}_error": result.details
|
|
29
|
+
}
|
|
30
|
+
except Exception as e:
|
|
31
|
+
return {
|
|
32
|
+
f"retrieval_answer_{metric}_error": str(e)
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_retrieval_evaluation_dict(
|
|
37
|
+
question_text: str,
|
|
38
|
+
actual_contexts: list[dict[str, str]],
|
|
39
|
+
reference_answer: str | None = None,
|
|
40
|
+
actual_answer: str | None = None,
|
|
41
|
+
model_name : str = "openai/gpt-4o-mini",
|
|
42
|
+
max_tokens : int = 65_536
|
|
43
|
+
) -> dict:
|
|
44
|
+
if not reference_answer and not actual_answer:
|
|
45
|
+
return {}
|
|
46
|
+
settings_dict = {
|
|
47
|
+
"model": model_name,
|
|
48
|
+
"max_tokens": max_tokens
|
|
49
|
+
}
|
|
50
|
+
entry = RagasResponseContextPrecisionEntry(
|
|
51
|
+
input=question_text,
|
|
52
|
+
expected_output=reference_answer,
|
|
53
|
+
output=actual_answer,
|
|
54
|
+
contexts=[a["text"] for a in actual_contexts]
|
|
55
|
+
)
|
|
56
|
+
result = {}
|
|
57
|
+
evaluator = RagasResponseContextRecallEvaluator(settings=settings_dict)
|
|
58
|
+
result.update(_evaluate(evaluator, entry, "recall"))
|
|
59
|
+
evaluator = RagasResponseContextPrecisionEvaluator(settings=settings_dict)
|
|
60
|
+
result.update(_evaluate(evaluator, entry, "precision"))
|
|
61
|
+
result.update(get_f1_dict(result, "retrieval_answer"))
|
|
62
|
+
return result
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from typing import Iterable
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def recall_at_k(relevant_ids: list, retrieved_ids: list, k: int = 10) -> float:
|
|
5
|
+
"""
|
|
6
|
+
Calculates Recall@k.
|
|
7
|
+
|
|
8
|
+
Args:
|
|
9
|
+
relevant_ids (list): A list of ground truth relevant document IDs.
|
|
10
|
+
retrieved_ids (list): A list of retrieved document IDs, ordered by rank.
|
|
11
|
+
k (int): The cutoff for the retrieval list.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
float: The Recall@k score.
|
|
15
|
+
"""
|
|
16
|
+
retrieved_at_k = retrieved_ids[:k]
|
|
17
|
+
relevant_at_k = relevant_ids[:k]
|
|
18
|
+
true_positives = len(set(relevant_at_k).intersection(set(retrieved_at_k)))
|
|
19
|
+
total_relevant = len(relevant_at_k)
|
|
20
|
+
if total_relevant == 0:
|
|
21
|
+
return 0.0
|
|
22
|
+
return true_positives / total_relevant
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def average_precision(relevant_ids: Iterable, retrieved_ids: Iterable) -> float:
|
|
26
|
+
"""
|
|
27
|
+
Calculates Average Precision (AP) for a single query.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
relevant_ids (Iterable): A set of ground truth relevant document IDs.
|
|
31
|
+
retrieved_ids (Iterable): A list of retrieved document IDs, ordered by rank.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
float: The Average Precision score.
|
|
35
|
+
"""
|
|
36
|
+
relevant_set = set(relevant_ids)
|
|
37
|
+
hits = 0
|
|
38
|
+
sum_of_precisions = 0.0
|
|
39
|
+
|
|
40
|
+
for i, doc_id in enumerate(retrieved_ids):
|
|
41
|
+
if doc_id in relevant_set:
|
|
42
|
+
hits += 1
|
|
43
|
+
precision_at_k = hits / (i + 1)
|
|
44
|
+
sum_of_precisions += precision_at_k
|
|
45
|
+
|
|
46
|
+
total_relevant = len(relevant_set)
|
|
47
|
+
if total_relevant == 0:
|
|
48
|
+
return 0.0
|
|
49
|
+
|
|
50
|
+
return sum_of_precisions / total_relevant
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from langevals_ragas.context_precision import (
|
|
2
|
+
RagasContextPrecisionEntry,
|
|
3
|
+
RagasContextPrecisionEvaluator,
|
|
4
|
+
)
|
|
5
|
+
from langevals_ragas.context_recall import (
|
|
6
|
+
RagasContextRecallEntry,
|
|
7
|
+
RagasContextRecallEvaluator,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from graphrag_eval.util import get_f1_dict
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _evaluate(
|
|
14
|
+
entry: RagasContextRecallEntry | RagasContextPrecisionEntry,
|
|
15
|
+
evauator: RagasContextRecallEvaluator | RagasContextPrecisionEvaluator,
|
|
16
|
+
metric: str
|
|
17
|
+
) -> dict:
|
|
18
|
+
try:
|
|
19
|
+
result = evauator.evaluate(entry)
|
|
20
|
+
if result.status == "processed":
|
|
21
|
+
result_dict = {
|
|
22
|
+
f"retrieval_context_{metric}": result.score,
|
|
23
|
+
}
|
|
24
|
+
if result.details:
|
|
25
|
+
result_dict[f"retrieval_context_{metric}_reason"] = result.details
|
|
26
|
+
if result.cost is not None:
|
|
27
|
+
result_dict[f"retrieval_context_{metric}_cost"] = result.cost.amount
|
|
28
|
+
return result_dict
|
|
29
|
+
else:
|
|
30
|
+
return {
|
|
31
|
+
f"retrieval_context_{metric}_error": result.details,
|
|
32
|
+
}
|
|
33
|
+
except Exception as e:
|
|
34
|
+
return {
|
|
35
|
+
f"retrieval_context_{metric}_error": str(e),
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_retrieval_evaluation_dict(
|
|
40
|
+
reference_contexts: list[dict[str, str]],
|
|
41
|
+
actual_contexts: list[dict[str, str]],
|
|
42
|
+
model_name : str = "openai/gpt-4o-mini",
|
|
43
|
+
max_tokens : int = 65_536
|
|
44
|
+
) -> dict:
|
|
45
|
+
settings_dict = {
|
|
46
|
+
"model": model_name,
|
|
47
|
+
"max_tokens": max_tokens
|
|
48
|
+
}
|
|
49
|
+
entry = RagasContextRecallEntry(
|
|
50
|
+
expected_contexts=[a["text"] for a in reference_contexts],
|
|
51
|
+
contexts=[a["text"] for a in actual_contexts]
|
|
52
|
+
)
|
|
53
|
+
result = {}
|
|
54
|
+
evaluator = RagasContextRecallEvaluator(settings=settings_dict)
|
|
55
|
+
result.update(_evaluate(entry, evaluator, "recall"))
|
|
56
|
+
evaluator = RagasContextPrecisionEvaluator(settings=settings_dict)
|
|
57
|
+
result.update(_evaluate(entry, evaluator, "precision"))
|
|
58
|
+
result.update(get_f1_dict(result, "retrieval_context"))
|
|
59
|
+
return result
|
|
@@ -1,10 +1,31 @@
|
|
|
1
1
|
from collections import Counter
|
|
2
|
+
import re
|
|
2
3
|
from typing import Union
|
|
3
4
|
import itertools
|
|
4
5
|
import math
|
|
5
6
|
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
XSD_NUMERIC_TYPES = {
|
|
8
|
+
"http://www.w3.org/2001/XMLSchema#integer",
|
|
9
|
+
"http://www.w3.org/2001/XMLSchema#int",
|
|
10
|
+
"http://www.w3.org/2001/XMLSchema#long",
|
|
11
|
+
"http://www.w3.org/2001/XMLSchema#short",
|
|
12
|
+
"http://www.w3.org/2001/XMLSchema#byte",
|
|
13
|
+
"http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
|
|
14
|
+
"http://www.w3.org/2001/XMLSchema#positiveInteger",
|
|
15
|
+
"http://www.w3.org/2001/XMLSchema#unsignedLong",
|
|
16
|
+
"http://www.w3.org/2001/XMLSchema#unsignedInt",
|
|
17
|
+
"http://www.w3.org/2001/XMLSchema#unsignedShort",
|
|
18
|
+
"http://www.w3.org/2001/XMLSchema#unsignedByte",
|
|
19
|
+
}
|
|
20
|
+
XSD_FLOAT_TYPES = {
|
|
21
|
+
"http://www.w3.org/2001/XMLSchema#decimal",
|
|
22
|
+
"http://www.w3.org/2001/XMLSchema#double",
|
|
23
|
+
"http://www.w3.org/2001/XMLSchema#float",
|
|
24
|
+
}
|
|
25
|
+
XSD_BOOLEAN = "http://www.w3.org/2001/XMLSchema#boolean"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def truncate(number: float, decimals: int = 0) -> float:
|
|
8
29
|
"""
|
|
9
30
|
Truncates a float to a certain number of decimal places.
|
|
10
31
|
"""
|
|
@@ -19,37 +40,92 @@ def truncate(number, decimals=0):
|
|
|
19
40
|
return math.trunc(number * factor) / factor
|
|
20
41
|
|
|
21
42
|
|
|
43
|
+
def parse_sparql_term(term: dict) -> Union[str, float, bool, None]:
|
|
44
|
+
if not isinstance(term, dict):
|
|
45
|
+
return term
|
|
46
|
+
|
|
47
|
+
term_type = term.get("type")
|
|
48
|
+
value = term.get("value")
|
|
49
|
+
|
|
50
|
+
if term_type in ("literal", "typed-literal"):
|
|
51
|
+
datatype = term.get("datatype")
|
|
52
|
+
if not datatype:
|
|
53
|
+
return value
|
|
54
|
+
|
|
55
|
+
if datatype in XSD_NUMERIC_TYPES:
|
|
56
|
+
try:
|
|
57
|
+
return int(value)
|
|
58
|
+
except (ValueError, TypeError):
|
|
59
|
+
return value
|
|
60
|
+
elif datatype in XSD_FLOAT_TYPES:
|
|
61
|
+
try:
|
|
62
|
+
value = float(value)
|
|
63
|
+
return truncate(value, 5)
|
|
64
|
+
except (ValueError, TypeError):
|
|
65
|
+
return value
|
|
66
|
+
elif datatype == XSD_BOOLEAN:
|
|
67
|
+
return value.lower() in ("true", "1")
|
|
68
|
+
else:
|
|
69
|
+
return value
|
|
70
|
+
|
|
71
|
+
return value
|
|
72
|
+
|
|
73
|
+
|
|
22
74
|
def get_var_to_values(
|
|
23
75
|
vars_: list[str],
|
|
24
76
|
bindings: list[dict],
|
|
25
77
|
) -> dict[str, list]:
|
|
26
|
-
var_to_values =
|
|
78
|
+
var_to_values = {}
|
|
27
79
|
for var in vars_:
|
|
28
80
|
var_to_values[var] = []
|
|
29
81
|
for binding in bindings:
|
|
30
82
|
if var in binding:
|
|
31
|
-
var_to_values[var].append(binding[var]
|
|
83
|
+
var_to_values[var].append(parse_sparql_term(binding[var]))
|
|
32
84
|
else:
|
|
33
85
|
var_to_values[var].append(None)
|
|
34
86
|
return dict(var_to_values)
|
|
35
87
|
|
|
36
88
|
|
|
37
|
-
def
|
|
89
|
+
def convert_table_dict2lines(
|
|
38
90
|
reference_vars: Union[list[str], tuple[str, ...]],
|
|
39
91
|
reference_var_to_values: dict[str, list],
|
|
40
92
|
) -> list[str]:
|
|
93
|
+
"""Converts a dictionary of lists (columns) into a list of row strings.
|
|
94
|
+
|
|
95
|
+
This function takes a dictionary where keys are column headers and values are
|
|
96
|
+
lists of column data. It transforms this column-oriented data into a list
|
|
97
|
+
of rows, where each row is a single string formed by concatenating the
|
|
98
|
+
string representation of its cell values.
|
|
99
|
+
|
|
100
|
+
It assumes that all lists in the `reference_var_to_values` dictionary
|
|
101
|
+
have the same length.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
reference_vars: An ordered list or tuple of keys that defines the
|
|
105
|
+
column order for the output rows.
|
|
106
|
+
reference_var_to_values: A dictionary mapping column names (keys) to
|
|
107
|
+
lists of their corresponding values.
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
A list of strings, where each string is a concatenation of the values
|
|
111
|
+
for a single row, ordered according to `reference_vars`.
|
|
112
|
+
|
|
113
|
+
Example:
|
|
114
|
+
>>> columns = ['name', 'age', 'city']
|
|
115
|
+
>>> data = {
|
|
116
|
+
... 'name': ['Alice', 'Bob'],
|
|
117
|
+
... 'age': [30, 25],
|
|
118
|
+
... 'city': ['New York', 'Los Angeles']
|
|
119
|
+
... }
|
|
120
|
+
>>> convert_table_dict2lines(columns, data)
|
|
121
|
+
['Alice30New York', 'Bob25Los Angeles']
|
|
122
|
+
"""
|
|
41
123
|
result = []
|
|
42
124
|
num_rows = len(reference_var_to_values[reference_vars[0]])
|
|
43
125
|
for row_idx in range(num_rows):
|
|
44
126
|
row = []
|
|
45
127
|
for reference_var in reference_vars:
|
|
46
128
|
val = reference_var_to_values[reference_var][row_idx]
|
|
47
|
-
if isinstance(val, float):
|
|
48
|
-
val = truncate(val, 5)
|
|
49
|
-
if isinstance(val, int):
|
|
50
|
-
print(val)
|
|
51
|
-
val = float(val)
|
|
52
|
-
print(str(val))
|
|
53
129
|
val = str(val)
|
|
54
130
|
row.append(val)
|
|
55
131
|
result.append("".join(row))
|
|
@@ -64,8 +140,6 @@ def compare_values(
|
|
|
64
140
|
results_are_ordered: bool,
|
|
65
141
|
) -> bool:
|
|
66
142
|
|
|
67
|
-
if len(reference_vars) > len(actual_vars):
|
|
68
|
-
return False
|
|
69
143
|
if len(reference_vars) < len(actual_vars):
|
|
70
144
|
for combination in itertools.combinations(actual_vars, len(reference_vars)):
|
|
71
145
|
if compare_values(
|
|
@@ -78,9 +152,9 @@ def compare_values(
|
|
|
78
152
|
return True
|
|
79
153
|
return False
|
|
80
154
|
|
|
81
|
-
table =
|
|
155
|
+
table = convert_table_dict2lines(reference_vars, reference_var_to_values)
|
|
82
156
|
for permutation in itertools.permutations(actual_vars):
|
|
83
|
-
actual_table =
|
|
157
|
+
actual_table = convert_table_dict2lines(permutation, actual_var_to_values)
|
|
84
158
|
if (results_are_ordered and table == actual_table) or (
|
|
85
159
|
not results_are_ordered and Counter(table) == Counter(actual_table)
|
|
86
160
|
):
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
def compute_f1(recall: float | str | None, precision: float | str | None) -> float | None:
|
|
2
|
+
if recall is None or precision is None:
|
|
3
|
+
return None
|
|
4
|
+
recall = float(recall)
|
|
5
|
+
precision = float(precision)
|
|
6
|
+
if recall == 0.0 and precision == 0.0:
|
|
7
|
+
return 0.0
|
|
8
|
+
return 2 * (recall * precision) / (recall + precision)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_f1_dict(
|
|
12
|
+
input_dict: dict,
|
|
13
|
+
prefix: str
|
|
14
|
+
) -> dict:
|
|
15
|
+
recall = input_dict.get(f"{prefix}_recall")
|
|
16
|
+
precision = input_dict.get(f"{prefix}_precision")
|
|
17
|
+
f1 = compute_f1(recall, precision)
|
|
18
|
+
if f1 is None:
|
|
19
|
+
return {}
|
|
20
|
+
result = {f"{prefix}_f1": f1}
|
|
21
|
+
recall_cost = input_dict.get(f"{prefix}_recall_cost")
|
|
22
|
+
precision_cost = input_dict.get(f"{prefix}_precision_cost")
|
|
23
|
+
if recall_cost is not None and precision_cost is not None:
|
|
24
|
+
result[f"{prefix}_f1_cost"] = recall_cost + precision_cost
|
|
25
|
+
return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-eval"
|
|
3
|
-
version = "
|
|
3
|
+
version = "5.0.0"
|
|
4
4
|
description = "For assessing question answering systems' final answers and intermediate steps, against a given set of questions, reference answers and steps."
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Neli Hateva", email = "neli.hateva@graphwise.ai"},
|
|
@@ -11,7 +11,7 @@ license = "Apache-2.0"
|
|
|
11
11
|
requires-python = ">=3.12,<3.13"
|
|
12
12
|
|
|
13
13
|
[project.urls]
|
|
14
|
-
repository = "https://github.com/Ontotext-AD/
|
|
14
|
+
repository = "https://github.com/Ontotext-AD/graphrag-eval"
|
|
15
15
|
|
|
16
16
|
[build-system]
|
|
17
17
|
requires = ["poetry-core>=2.0.0"]
|
|
@@ -35,4 +35,4 @@ langevals-ragas = "^0.1.12"
|
|
|
35
35
|
optional = true
|
|
36
36
|
|
|
37
37
|
[project.scripts]
|
|
38
|
-
answer-correctness = "
|
|
38
|
+
answer-correctness = "graphrag_eval.answer_correctness:main"
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
from typing import Iterable
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def recall_at_k(relevant_docs: Iterable, retrieved_docs: list, k: int = 10) -> float:
|
|
5
|
-
"""
|
|
6
|
-
Calculates Recall@k.
|
|
7
|
-
|
|
8
|
-
Args:
|
|
9
|
-
relevant_docs (Iterable): A set of ground truth relevant document IDs.
|
|
10
|
-
retrieved_docs (list): A list of retrieved document IDs, ordered by rank.
|
|
11
|
-
k (int): The cutoff for the retrieval list.
|
|
12
|
-
|
|
13
|
-
Returns:
|
|
14
|
-
float: The Recall@k score.
|
|
15
|
-
"""
|
|
16
|
-
retrieved_at_k = retrieved_docs[:k]
|
|
17
|
-
|
|
18
|
-
relevant_set = set(relevant_docs)
|
|
19
|
-
retrieved_set = set(retrieved_at_k)
|
|
20
|
-
true_positives = len(relevant_set.intersection(retrieved_set))
|
|
21
|
-
|
|
22
|
-
total_relevant = len(relevant_set)
|
|
23
|
-
|
|
24
|
-
if total_relevant == 0:
|
|
25
|
-
return 0.0
|
|
26
|
-
|
|
27
|
-
return true_positives / total_relevant
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def average_precision(relevant_docs: Iterable, retrieved_docs: Iterable) -> float:
|
|
31
|
-
"""
|
|
32
|
-
Calculates Average Precision (AP) for a single query.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
relevant_docs (Iterable): A set of ground truth relevant document IDs.
|
|
36
|
-
retrieved_docs (Iterable): A list of retrieved document IDs, ordered by rank.
|
|
37
|
-
|
|
38
|
-
Returns:
|
|
39
|
-
float: The Average Precision score.
|
|
40
|
-
"""
|
|
41
|
-
relevant_set = set(relevant_docs)
|
|
42
|
-
hits = 0
|
|
43
|
-
sum_of_precisions = 0.0
|
|
44
|
-
|
|
45
|
-
for i, doc_id in enumerate(retrieved_docs):
|
|
46
|
-
if doc_id in relevant_set:
|
|
47
|
-
hits += 1
|
|
48
|
-
precision_at_k = hits / (i + 1)
|
|
49
|
-
sum_of_precisions += precision_at_k
|
|
50
|
-
|
|
51
|
-
total_relevant = len(relevant_set)
|
|
52
|
-
if total_relevant == 0:
|
|
53
|
-
return 0.0
|
|
54
|
-
|
|
55
|
-
return sum_of_precisions / total_relevant
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|